def train(opt):
    # print('decay', opt.num_decay_epochs)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    # Specify the board width, height and the size of one block
    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    model = DeepQNetwork()  # create the model instance
    # optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()
    state = env.reset()  # initial state: tensor([0., 0., 0., 0.])
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()
    replay_memory = deque(maxlen=opt.replay_memory_size)  # at most 30000 transitions
    epoch = 0
    while epoch < opt.num_epochs:  # repeat for the specified number of epochs
        # Compute the resulting state for every possible placement of the current piece:
        # {(column, rotation): tensor([., ., ., .]), ...}
        next_steps = env.get_next_states()
        # Epsilon-greedy exploration
        # epsilon = opt.final_epsilon + (max(opt.num_decay_epochs - epoch, 0) *  # constant after num_decay_epochs
        #           (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        epsilon = opt.initial_epsilon - opt.initial_epsilon * epoch / opt.num_epochs  # linear decay to zero
        u = random()  # uniform in [0, 1]
        random_action = u <= epsilon  # True or False
        next_actions, next_states = zip(*next_steps.items())  # unpack keys and values of next_steps
        next_states = torch.stack(next_states)  # tensor([[., ., ., .], ...])
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        model.eval()
        with torch.no_grad():
            # Forward pass of DeepQNetwork: one Q-value-like score per candidate placement
            predictions = model(next_states)[:, 0]
        model.train()
        # Pick an index into next_steps either randomly or greedily
        if random_action:
            # Random action
            index = randint(0, len(next_steps) - 1)
        else:
            # Greedy action (largest prediction)
            index = torch.argmax(predictions).item()
        # Commit to the action and the state it leads to
        next_state = next_states[index, :]  # state reached by the chosen placement
        action = next_actions[index]  # action: (column, rotation)
        # Execute the action and get the reward (score); done=True when the board overflows
        reward, done = env.step(action, epoch, render=False)
        if torch.cuda.is_available():
            next_state = next_state.cuda()
        # e.g. deque([[tensor([0., 0., 0., 0.]), 1, tensor([0., 0., 2., 4.]), False], ...], maxlen=30000)
        replay_memory.append([state, reward, next_state, done])
        if done:  # the board overflowed or the 100-move cap was reached
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            cleared_lines1 = env.cleared_lines1
            cleared_lines2 = env.cleared_lines2
            cleared_lines3 = env.cleared_lines3
            cleared_lines4 = env.cleared_lines4
            state = env.reset()  # back to the initial state tensor([0., 0., 0., 0.])
            if torch.cuda.is_available():
                state = state.cuda()
        else:  # the game is still running
            state = next_state  # update the state, e.g. tensor([0., 1., 2., 5.])
            continue  # back to the top of the while loop
        # if len(replay_memory) < opt.replay_memory_size / 1000:  # (disabled) wait until 3000 pieces have accumulated
        #     continue
        # Run the update below every time an episode ends
        epoch += 1
        # Sample batch_size transitions at random from replay_memory
        # (or all of them when len(replay_memory) < opt.batch_size)
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        replay_memory.clear()  # empty the whole buffer
        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))  # tensor([[0., 26., 16., 62.], ...])
        reward_batch = torch.from_numpy(np.array(reward_batch, dtype=np.float32)[:, None])  # tensor([[1.], ...])
        next_state_batch = torch.stack(tuple(state for state in next_state_batch))  # tensor([[0., 32., 13., 72.], ...])
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        q_values = model(state_batch)  # predicted Q-values, e.g. tensor([[0.1810], ...])
        model.eval()
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)  # predicted Q-values for the next states
        model.train()
        # Build the Q-value targets from the Bellman update
        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(reward_batch, done_batch, next_prediction_batch)))[:, None]
        optimizer.zero_grad()  # reset gradients before the optimization step
        # MSE loss between predictions (q_values) and targets (y_batch)
        loss = criterion(q_values, y_batch)
        """
        length = len(q_values)
        errors = np.zeros([length])
        print('size', len(q_values), len(y_batch))
        for i in range(length):
            print('Q', q_values[i])
            print('Y', y_batch[i])
            errors[i] = (q_values[i] - y_batch[i]) ** 2
        error = np.mean(errors)
        print('error', error)
        print('loss', loss)
        """
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print("Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}".format(
                epoch, opt.num_epochs, action, final_score, final_tetrominoes, final_cleared_lines))
        # Log the training scores to CSV
        mode = 'w' if epoch == 1 else 'a'
        with open('Score_train.csv', mode=mode, newline="") as Score_train_Record:
            score_writer = csv.writer(Score_train_Record)
            score_writer.writerow([epoch, final_tetrominoes, final_score, final_cleared_lines,
                                   cleared_lines1, cleared_lines2, cleared_lines3, cleared_lines4])
        """
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, epoch - 1)
        """
        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/tetris2_{}".format(opt.saved_path, epoch))  # periodic checkpoint in trained_models
        if final_tetrominoes > 500:
            # Save the weights and biases to CSV for models that placed more than 500 pieces
            save_model_parameter(model)
    torch.save(model, "{}/tetris2".format(opt.saved_path))  # save the final model to trained_models
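# The Tetris variants in this file all instantiate DeepQNetwork() with no arguments and read a
# single scalar per candidate state from its output, so the network is presumably a small MLP over
# the 4 board features. The sketch below is only an assumption about that architecture (layer
# sizes are guesses), not the class shipped with these scripts.
import torch.nn as nn


class DeepQNetworkSketch(nn.Module):
    """Hypothetical stand-in for DeepQNetwork: 4 board features -> 1 value-like score."""

    def __init__(self, in_features=4, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden), nn.ReLU(inplace=True),
            nn.Linear(hidden, hidden), nn.ReLU(inplace=True),
            nn.Linear(hidden, 1),
        )

    def forward(self, x):
        # x: [batch, 4] board features; returns [batch, 1] scores
        return self.net(x)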
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    model = DeepQNetwork()
    model_target = DeepQNetwork()
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()
    game_state = FlappyBird()
    image, reward, terminal, score = game_state.next_frame(0)
    image = pre_processing(image[:game_state.screen_width, :int(game_state.base_y)],
                           opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        model_target.cuda()
        image = image.cuda()
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    model_target.eval()
    replay_memory = []
    iter = 0
    while iter < opt.num_iters:
        prediction = model(state)[0]
        # Exploration or exploitation
        epsilon = opt.final_epsilon + ((opt.num_iters - iter) *
                                       (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            # print("Perform a random action")
            action = randint(0, 1)
        else:
            action = torch.argmax(prediction).item()
        next_image, reward, terminal, score = game_state.next_frame(action)
        next_image = pre_processing(next_image[:game_state.screen_width, :int(game_state.base_y)],
                                    opt.image_size, opt.image_size)
        next_image = torch.from_numpy(next_image)
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]
        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[0]
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(*batch)
        state_batch = torch.cat(tuple(state for state in state_batch))
        action_batch = torch.from_numpy(
            np.array([[1, 0] if action == 0 else [0, 1] for action in action_batch], dtype=np.float32))
        reward_batch = torch.from_numpy(np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.cat(tuple(state for state in next_state_batch))
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        current_prediction_batch = model(state_batch)
        next_prediction_batch = model_target(next_state_batch)
        y_batch = torch.cat(
            tuple(reward if terminal else reward + opt.gamma * prediction[max_action]
                  for reward, terminal, prediction, max_action in zip(
                      reward_batch, terminal_batch, next_prediction_batch,
                      torch.argmax(model(next_state_batch), axis=1))))
        q_value = torch.sum(current_prediction_batch * action_batch, dim=1)
        optimizer.zero_grad()
        # y_batch = y_batch.detach()
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()
        state = next_state
        if iter % opt.target_update_freq == 0:
            model_target.load_state_dict(model.state_dict())
        iter += 1
        if iter % 100 == 0:
            print("Test::Double Q: Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}".format(
                iter + 1, opt.num_iters, action, loss, epsilon, reward, torch.max(prediction)))
        writer.add_scalar('Train/Loss', loss, iter)
        writer.add_scalar('Train/Epsilon', epsilon, iter)
        writer.add_scalar('Train/Reward', reward, iter)
        writer.add_scalar('Train/Q-value', torch.max(prediction), iter)
        writer.add_scalar('Train/score', score, iter)
        if (iter + 1) % 1000000 == 0:
            torch.save(model, "{}/flappy_bird_{}".format(opt.saved_path, iter + 1))
    torch.save(model, "{}/flappy_bird".format(opt.saved_path))
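# The y_batch construction above implements the Double DQN rule: the online network selects the
# greedy next action and the target network evaluates it. Below is a minimal, vectorised sketch of
# the same rule, assuming `online` and `target` are any nn.Module mapping a batch of states to
# per-action Q-values; it is equivalent to the per-sample zip() loop above, but detached from the
# autograd graph.
import torch


def double_dqn_targets(online, target, rewards, next_states, terminals, gamma):
    # rewards: [B], terminals: [B] (bool), next_states: [B, ...]
    with torch.no_grad():
        best = torch.argmax(online(next_states), dim=1, keepdim=True)  # action selection (online net)
        next_q = target(next_states).gather(1, best).squeeze(1)        # action evaluation (target net)
    return rewards + gamma * (1.0 - terminals.float()) * next_q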
def train(opt):
    if torch.cuda.is_available():
        # With a fixed random seed, training results stay reproducible
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    model = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()
    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()
    replay_memory = deque(maxlen=opt.replay_memory_size)
    epoch = 0
    while epoch < opt.num_epochs:
        # Get every possible placement of the current piece
        next_steps = env.get_next_states()
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (max(opt.num_decay_epochs - epoch, 0) *
                                       (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        u = random()
        random_action = u <= epsilon
        # Column and rotation of each possible drop, plus the resulting board state
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        model.train()
        # Choose the action
        if random_action:
            index = randint(0, len(next_steps) - 1)
        else:
            index = torch.argmax(predictions).item()
        next_state = next_states[index, :]
        action = next_actions[index]
        reward, done = env.step(action, render=False)
        if torch.cuda.is_available():
            next_state = next_state.cuda()
        replay_memory.append([state, reward, next_state, done])
        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue
        if len(replay_memory) < opt.replay_memory_size / 10:
            continue
        epoch += 1
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        '''
        a = [2, 3, 4], b = [5, 6, 7], c = [a, b]
        e, f, g = zip(*c)
        e = (2, 5), f = (3, 6), g = (4, 7)  -- each of type tuple
        '''
        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(tuple(state for state in next_state_batch))
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        q_values = model(state_batch)
        model.eval()
        # Q_target
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()
        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(reward_batch, done_batch, next_prediction_batch)))[:, None]
        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()
        print("Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}".format(
            epoch, opt.num_epochs, action, final_score, final_tetrominoes, final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, epoch - 1)
        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/tetris_{}".format(opt.saved_path, epoch))
    torch.save(model, "{}/tetris".format(opt.saved_path))
def test(opt, conv1, conv2, conv3):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # if torch.cuda.is_available():
    #     model = torch.load("{}/tetris".format(opt.saved_path))
    # else:
    #     model = torch.load("{}/tetris".format(opt.saved_path), map_location=lambda storage, loc: storage)
    model = DeepQNetwork()
    model.eval()
    if False:  # save the weights of each Linear layer to CSV
        ii = 1
        for layer in model.modules():
            if isinstance(layer, nn.Linear):
                if ii == 1:
                    weights1 = layer.weight.cpu().detach().numpy()
                    pd.DataFrame(weights1).to_csv('trained_models/conv{}.csv'.format(ii))
                if ii == 2:
                    weights2 = layer.weight.cpu().detach().numpy()
                    pd.DataFrame(weights2).to_csv('trained_models/conv{}.csv'.format(ii))
                if ii == 3:
                    weights3 = layer.weight.cpu().detach().numpy()
                    pd.DataFrame(weights3).to_csv('trained_models/conv{}.csv'.format(ii))
                ii += 1
    if False:  # load weights for each Linear layer from the CSV arguments
        ii = 1
        for layer in model.modules():
            if isinstance(layer, nn.Linear):
                with torch.no_grad():
                    if ii == 1:
                        layer.weight.data = torch.Tensor(conv1).cuda()
                    if ii == 2:
                        layer.weight.data = torch.Tensor(conv2).cuda()
                    if ii == 3:
                        layer.weight.data = torch.Tensor(conv3).cuda()
                ii += 1
    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    env.reset()
    if torch.cuda.is_available():
        model.cuda()
    out = cv2.VideoWriter(opt.output, cv2.VideoWriter_fourcc(*"MJPG"), opt.fps,
                          (int(1.5 * opt.width * opt.block_size), opt.height * opt.block_size))
    while True:
        next_steps = env.get_next_states()
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        predictions = model(next_states)[:, 0]
        index = torch.argmax(predictions).item()
        action = next_actions[index]
        result, done = env.step(action, render=True, video=out)
        if done:
            out.release()
            return result
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # TensorBoard
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    # Model
    CHECKPOINT_FILE = opt.saved_path + "/" + opt.checkpoint_name
    if opt.load:
        if os.path.isfile(CHECKPOINT_FILE):
            print("--> Loading checkpoint '{}'.".format(CHECKPOINT_FILE))
            if torch.cuda.is_available():
                model = torch.load(CHECKPOINT_FILE)
            else:
                model = torch.load(CHECKPOINT_FILE, map_location=lambda storage, loc: storage)
            print("--> Checkpoint '{}' loaded.".format(CHECKPOINT_FILE))
        else:
            print("--> Checkpoint '{}' not found.".format(CHECKPOINT_FILE))
            model = DeepQNetwork()
    else:
        model = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()
    # Environment
    env = Tetris(width=opt.width, height=opt.height)
    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()
    replay_memory = deque(maxlen=opt.replay_memory_size)
    epoch = 0
    prev_loss = 0
    # Epochs recorded in the checkpoint name
    if opt.load and "_" in opt.checkpoint_name:
        start_epoch = opt.checkpoint_name.split("_")[-1]
        epoch = int(start_epoch)
        print("Checkpoint with {} epochs.".format(epoch))
    # Training loop
    while epoch < opt.num_epochs:
        next_steps = env.get_next_states()
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (max(opt.num_decay_epochs - epoch, 0) *
                                       (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        u = random()
        random_action = u <= epsilon
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        model.train()
        if random_action:
            index = randint(0, len(next_steps) - 1)
        else:
            index = torch.argmax(predictions).item()
        next_state = next_states[index, :]
        action = next_actions[index]
        reward, done = env.step(action, render=True)
        if torch.cuda.is_available():
            next_state = next_state.cuda()
        replay_memory.append([state, reward, next_state, done])
        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue
        # Replay buffer warm-up
        if len(replay_memory) < opt.replay_memory_size / 10:
            print("replay_memory ", len(replay_memory))
            continue
        epoch += 1
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(tuple(state for state in next_state_batch))
        # Learning
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        q_values = model(state_batch)
        model.eval()
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()
        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(reward_batch, done_batch, next_prediction_batch)))[:, None]
        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()
        prev_loss = loss.item()
        print("Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}".format(
            epoch, opt.num_epochs, action, final_score, final_tetrominoes, final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, epoch - 1)
        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/{}_{}".format(opt.saved_path, opt.saved_name, epoch))
    torch.save(model, "{}/{}".format(opt.saved_path, opt.saved_name))
def train(opt):
    cv2.setUseOptimized(True)
    print("cv2 is optimized =", cv2.useOptimized())
    print("cuda available =", torch.cuda.is_available())
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    #############
    # The environment in this situation is the Tetris game itself.
    # Create a Tetris object with the constructor in src/tetris.py and pass in the 3 arguments described there;
    # the arguments' default values are stored in "opt", which comes from the argument parser above.
    #############
    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    model = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()
    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()
    replay_memory = deque(maxlen=opt.replay_memory_size)
    epoch = 0
    while epoch < opt.num_epochs:
        next_steps = env.get_next_states()
        # Exploration or exploitation
        #############
        # epsilon = a threshold that decides how likely it is that a random action is performed (often called "eps_threshold").
        # Insert a function that creates a value decaying per epoch, between 1 and 0; the number of epochs is defined as
        # input, with defaults in the parser (top of this file).
        # The best results will probably come from a function that decays over the first x epochs and equals some final
        # low value, e.g. 0.001, for the last y epochs.
        #############
        epsilon = opt.final_epsilon + (max(opt.num_decay_epochs - epoch, 0) *
                                       (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        #############
        # u = a random number between 0 and 1 that decides whether a random action is performed
        # or an action chosen by the model.
        #############
        u = random()
        random_action = u <= epsilon
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        # If CUDA is available, move the data to the GPU
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        model.eval()  # put the model in evaluation mode (built-in function)
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        model.train()
        if random_action:
            index = randint(0, len(next_steps) - 1)
        else:
            index = torch.argmax(predictions).item()
        next_state = next_states[index, :]
        action = next_actions[index]
        #############
        # reward = metric of how well the model has performed
        # done = whether or not the model is finished with the current game
        # A function that outputs both of these state variables can be found in src/tetris.py.
        # Remember to pass in the second argument as well, otherwise all training will be done with visualization,
        # which is cool to watch, but slow.
        #############
        reward, done = env.step(action, render=False)
        if torch.cuda.is_available():
            next_state = next_state.cuda()
        replay_memory.append([state, reward, next_state, done])
        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue
        if len(replay_memory) < opt.replay_memory_size / 10:
            continue
        epoch += 1  # increment the epoch counter
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(tuple(state for state in next_state_batch))
        # If CUDA is available, move the batch to the GPU
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        q_values = model(state_batch)
        model.eval()  # put the model in evaluation mode (built-in function)
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()
        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(reward_batch, done_batch, next_prediction_batch)))[:, None]
        #############
        # Optimizers are algorithms or methods used to change the attributes of the neural network, such as weights
        # and learning rate, in order to reduce the loss.
        # To ensure that behaviour learned in one epoch is not reused across epochs to tune the network
        # (accumulating gradients), we need to set all gradients to zero; this is a built-in function.
        #############
        optimizer.zero_grad()
        #############
        # A criterion is a loss function that compares two tensors (PyTorch datatype).
        # Pass two tensors into our criterion (loss function, set to MSELoss()).
        #############
        loss = criterion(q_values, y_batch)
        #############
        # The criterion outputs a loss object that stores the loss value and lets us tune the network by backpropagation.
        # The loss object has a built-in function that does this.
        #############
        loss.backward()
        #############
        # The optimizer should here perform a parameter update based on the current gradient,
        # which it does through a built-in function.
        #############
        optimizer.step()
        print("Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}".format(
            epoch, opt.num_epochs, action, final_score, final_tetrominoes, final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, epoch - 1)
        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/tetris_{}".format(opt.saved_path, epoch))
    #############
    # PyTorch offers a way of saving either full models or a model state's weights as a file.
    # Search pytorch.org for this save function.
    #############
    torch.save(model, "{}/tetris".format(opt.saved_path))
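# The skeleton above reads its settings from `opt`, produced by an argument parser that is not
# shown here. The sketch below is a hypothetical parser covering exactly the fields the training
# loop accesses; the default values are assumptions, not the project's actual defaults.
import argparse


def get_args_sketch():
    p = argparse.ArgumentParser("DQN Tetris training (hypothetical argument parser)")
    p.add_argument("--width", type=int, default=10)
    p.add_argument("--height", type=int, default=20)
    p.add_argument("--block_size", type=int, default=30)
    p.add_argument("--batch_size", type=int, default=512)
    p.add_argument("--lr", type=float, default=1e-3)
    p.add_argument("--gamma", type=float, default=0.99)
    p.add_argument("--initial_epsilon", type=float, default=1.0)
    p.add_argument("--final_epsilon", type=float, default=1e-3)
    p.add_argument("--num_decay_epochs", type=int, default=2000)
    p.add_argument("--num_epochs", type=int, default=3000)
    p.add_argument("--save_interval", type=int, default=1000)
    p.add_argument("--replay_memory_size", type=int, default=30000)
    p.add_argument("--log_path", type=str, default="tensorboard")
    p.add_argument("--saved_path", type=str, default="trained_models")
    return p.parse_args()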
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    model = DeepQNetwork()
    model_target = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()
    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        model_target.cuda()
        state = state.cuda()
    if opt.PER:
        replay_memory = Memory(capacity=opt.replay_memory_size)
    else:
        replay_memory = deque(maxlen=opt.replay_memory_size)
    epoch = 0
    warmup_epoch = 0
    while epoch < opt.num_epochs:
        next_steps = env.get_next_states()
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (max(opt.num_decay_epochs - epoch, 0) *
                                       (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        u = random()
        random_action = u <= epsilon
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        model.train()
        if random_action:
            index = randint(0, len(next_steps) - 1)
        else:
            index = torch.argmax(predictions).item()
        next_state = next_states[index, :]
        action = next_actions[index]
        reward, done = env.step(action, render=True)
        if torch.cuda.is_available():
            next_state = next_state.cuda()
        if opt.PER:
            experience = state, action, reward, next_state, done
            replay_memory.store(experience)
        else:
            # Store the action as well so the 5-way unpack below works for both buffers
            replay_memory.append([state, action, reward, next_state, done])
        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue
        warmup_epoch += 1
        if warmup_epoch < opt.learning_starts:
            continue
        epoch += 1
        if opt.PER:
            tree_idx, batch = replay_memory.sample(opt.batch_size)
        else:
            batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, _, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(tuple(state for state in next_state_batch))
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        q_values = model(state_batch)
        model_target.eval()
        with torch.no_grad():
            next_prediction_batch = model_target(next_state_batch)
        model_target.train()
        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(reward_batch, done_batch, next_prediction_batch)))[:, None]
        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()
        model.eval()
        model_target.eval()
        if opt.PER:
            # Update the priorities with the absolute TD errors
            with torch.no_grad():
                if torch.cuda.is_available():
                    replay_memory.batch_update(
                        tree_idx, np.abs(q_values.detach().cpu().numpy() - y_batch.cpu().numpy()))
                else:
                    replay_memory.batch_update(
                        tree_idx, np.abs(q_values.detach().numpy() - y_batch.numpy()))
        # Update target model <- model
        if epoch % opt.target_update_freq == 0:
            with torch.no_grad():
                model_target.load_state_dict(model.state_dict())
            model_target.train()
        model.eval()
        print("Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}".format(
            epoch, opt.num_epochs, action, final_score, final_tetrominoes, final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, epoch - 1)
        if (epoch > 0 and epoch % opt.save_interval == 0) or final_score >= 10000.0:
            torch.save(model, "{}/tetris_{}".format(opt.saved_path, epoch))
    torch.save(model, "{}/tetris".format(opt.saved_path))
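# The PER branch above relies on a Memory class exposing store(), sample() and batch_update();
# that class is not shown in this file. The sketch below is a simplified proportional-priority
# buffer with the same interface (no SumTree, O(n) sampling), offered only to document the
# assumed contract, not the project's actual implementation.
import numpy as np


class SimplePERMemory:
    def __init__(self, capacity, alpha=0.6, eps=1e-2):
        self.capacity, self.alpha, self.eps = capacity, alpha, eps
        self.data, self.priorities = [], []
        self.pos = 0

    def store(self, experience):
        max_p = max(self.priorities, default=1.0)  # new samples get the current max priority
        if len(self.data) < self.capacity:
            self.data.append(experience)
            self.priorities.append(max_p)
        else:
            self.data[self.pos] = experience
            self.priorities[self.pos] = max_p
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size):
        probs = np.asarray(self.priorities) ** self.alpha
        probs /= probs.sum()
        idx = np.random.choice(len(self.data), size=min(batch_size, len(self.data)), p=probs)
        return idx, [self.data[i] for i in idx]

    def batch_update(self, idx, abs_errors):
        # Priorities are refreshed with the absolute TD errors computed after the update step
        for i, err in zip(idx, np.asarray(abs_errors).flatten()):
            self.priorities[i] = float(err) + self.eps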
def train(opt):
    # Set random seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(opt.random_seed)
    else:
        torch.manual_seed(opt.random_seed)
    # Instantiate the model
    if opt.conv_dim is not None and \
       opt.conv_kernel_sizes is not None and \
       opt.conv_strides is not None and \
       opt.fc_dim is not None:
        model = DeepQNetwork(opt.image_size, opt.image_size,
                             conv_dim=opt.conv_dim,
                             conv_kernel_sizes=opt.conv_kernel_sizes,
                             conv_strides=opt.conv_strides,
                             fc_dim=opt.fc_dim)
    else:
        model = DeepQNetwork(opt.image_size, opt.image_size)
    if opt.log_comet_ml:
        # Create a Comet.ml experiment
        experiment = Experiment(api_key=opt.comet_ml_api_key,
                                project_name=opt.comet_ml_project_name,
                                workspace=opt.comet_ml_workspace)
        experiment.log_other("iters_to_save", opt.iters_to_save)
        experiment.log_other("completed", False)
        experiment.log_other("random_seed", opt.random_seed)
        # Report hyperparameters to Comet.ml
        hyper_params = {
            "image_size": opt.image_size,
            "batch_size": opt.batch_size,
            "optimizer": opt.optimizer,
            "learning_rate": opt.lr,
            "gamma": opt.gamma,
            "initial_epsilon": opt.initial_epsilon,
            "final_epsilon": opt.final_epsilon,
            "num_iters": opt.num_iters,
            "replay_memory_size": opt.replay_memory_size,
            "random_seed": opt.random_seed,
            "conv_dim": opt.conv_dim,
            "conv_kernel_sizes": opt.conv_kernel_sizes,
            "conv_strides": opt.conv_strides,
            "fc_dim": opt.fc_dim
        }
        experiment.log_parameters(hyper_params)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)  # Optimization algorithm
    criterion = nn.MSELoss()  # Loss function
    game_state = FlappyBird()  # Instantiate the Flappy Compass game
    # Get the next image, along with its reward and an indication of whether it is a terminal state
    image, reward, terminal = game_state.next_frame(0)
    # Image preprocessing step (scaling, color removal and conversion to a PyTorch tensor)
    image = pre_processing(image[:game_state.screen_width, :int(game_state.base_y)],
                           opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    # Move the model and the current image data to the GPU, if available
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    # Prepare the state variable, which will host the last 4 frames
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    # Initialize the replay memory, which saves sets of consecutive game states, the reward and terminal state
    # indicator so that the model can learn from them (essentially the training data, which grows with every iteration)
    replay_memory = []
    iter = 0  # Iteration counter
    # Main training loop, performing the number of iterations specified by num_iters
    while iter < opt.num_iters:
        prediction = model(state)[0]  # Get a prediction from the current state
        # Set the decay of the probability of random actions
        epsilon = opt.final_epsilon + ((opt.num_iters - iter) *
                                       (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            action = randint(0, 1)
        else:
            # Use the model's prediction to decide the next action
            action = torch.argmax(prediction).item()
        # Get a new frame and process it
        next_image, reward, terminal = game_state.next_frame(action)
        next_image = pre_processing(next_image[:game_state.screen_width, :int(game_state.base_y)],
                                    opt.image_size, opt.image_size)
        next_image = torch.from_numpy(next_image)
        # Move the next image data to the GPU, if available
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        # Prepare the next state variable, which will host the last 4 frames
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]
        # Save the current state, action, reward, next state and terminal state indicator in the replay memory
        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[0]  # Delete the oldest replay from memory if full capacity has been reached
        # Retrieve past play sequences from the replay memory
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(*batch)
        state_batch = torch.cat(tuple(state for state in state_batch))  # States of the current batch
        action_batch = torch.from_numpy(
            np.array([[1, 0] if action == 0 else [0, 1] for action in action_batch],
                     dtype=np.float32))  # Actions taken in the current batch
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])  # Rewards in the current batch
        next_state_batch = torch.cat(tuple(state for state in next_state_batch))  # Next states of the current batch
        # Move batch data to the GPU, if available
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        current_prediction_batch = model(state_batch)  # Predictions of the model for the replays of the current batch
        next_prediction_batch = model(next_state_batch)  # Predictions of the model for the next states of the batch
        # Set the ground truth for the rewards of the current batch, considering whether the state is terminal or not
        y_batch = torch.cat(
            tuple(reward if terminal else reward + opt.gamma * torch.max(prediction)
                  for reward, terminal, prediction in zip(reward_batch, terminal_batch, next_prediction_batch)))
        # Predicted Q-values (i.e. estimated return for each action)
        q_value = torch.sum(current_prediction_batch * action_batch, dim=1)
        optimizer.zero_grad()  # Reset the gradients to zero before a new optimization step
        loss = criterion(q_value, y_batch)  # Calculate the loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Weights optimization step
        state = next_state  # Move to the next frame
        iter += 1
        print("Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}".format(
            iter + 1, opt.num_iters, action, loss, epsilon, reward, torch.max(prediction)))
        if opt.log_comet_ml:
            # Log metrics to Comet.ml
            experiment.log_metric("train_loss", loss, step=iter)
            experiment.log_metric("train_epsilon", epsilon, step=iter)
            experiment.log_metric("train_reward", reward, step=iter)
            experiment.log_metric("train_Q_value", torch.max(prediction), step=iter)
        if (iter + 1) % opt.iters_to_save == 0:
            # Get the current day and time to attach to the saved model's name
            current_datetime = datetime.now().strftime('%d_%m_%Y_%H_%M')
            # Set the saved model name
            model_filename = f'{opt.saved_path}/flappy_compass_{current_datetime}_{iter+1}.pth'
            # Save the model every iters_to_save iterations
            torch.save(model, model_filename)
            if opt.log_comet_ml and opt.comet_ml_save_model:
                # Upload the model to Comet.ml
                experiment.log_asset(file_path=model_filename, overwrite=True)
    # Get the current day and time to attach to the saved model's name
    current_datetime = datetime.now().strftime('%d_%m_%Y_%H_%M')
    # Set the saved model name
    model_filename = f'{opt.saved_path}/flappy_compass_{current_datetime}_{iter+1}.pth'
    # Save the model after reaching the final iteration
    torch.save(model, model_filename)
    if opt.log_comet_ml:
        # Only report that the experiment completed successfully if training finished without errors
        experiment.log_other("completed", True)
        if opt.comet_ml_save_model:
            # Upload the model to Comet.ml
            experiment.log_asset(file_path=model_filename, overwrite=True)
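# Every Flappy Bird / NS-SHAFT variant in this file calls pre_processing() before feeding frames to
# the network, but the helper itself is not shown. The sketch below is an assumption about what it
# does (resize, grayscale, binarise, add a channel axis) that would produce the [1, H, W] float
# frames the state-stacking code above expects; it is not the project's actual helper.
import cv2
import numpy as np


def pre_processing_sketch(image, width, height):
    frame = cv2.cvtColor(cv2.resize(image, (width, height)), cv2.COLOR_BGR2GRAY)
    _, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    return frame[None, :, :].astype(np.float32)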
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    model = DeepQNetwork()
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()
    game_state = NS_SHAFT()
    image, reward, terminal = game_state.next_frame(0)
    imgplot = plt.imshow(image)
    image = pre_processing(image[:game_state.screen_width, :int(400)], opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    replay_memory = []
    iter = 0
    while iter < opt.num_iters:
        # plt.plot([1, 2, 3, 4])
        prediction = model(state)
        # Exploration or exploitation
        epsilon = opt.final_epsilon + ((opt.num_iters - iter) *
                                       (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            action = randint(-1, 1) * 4
        else:
            # print('prediction', prediction)
            action = (prediction.data.max(1)[1].item() - 1) * 4
            # print('a', action)
        next_image, reward, terminal = game_state.next_frame(action)
        next_image = pre_processing(next_image[:game_state.screen_width, :int(400)],
                                    opt.image_size, opt.image_size)
        next_image = torch.from_numpy(next_image)
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]
        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[0]
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(*batch)
        state_batch = torch.cat(tuple(state for state in state_batch))
        action_batch_tmp = []
        for action in action_batch:
            if action == -4:
                action_batch_tmp.append([1, 0, 0])
            elif action == 0:
                action_batch_tmp.append([0, 1, 0])
            else:
                action_batch_tmp.append([0, 0, 1])
        action_batch = torch.from_numpy(np.array(action_batch_tmp, dtype=np.float32))
        reward_batch = torch.from_numpy(np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.cat(tuple(state for state in next_state_batch))
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        current_prediction_batch = model(state_batch)
        next_prediction_batch = model(next_state_batch)
        y_batch = torch.cat(
            tuple(reward if terminal else reward + opt.gamma * torch.max(prediction)
                  for reward, terminal, prediction in zip(reward_batch, terminal_batch, next_prediction_batch)))
        q_value = torch.sum(current_prediction_batch * action_batch, dim=1)
        optimizer.zero_grad()
        # y_batch = y_batch.detach()
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()
        state = next_state
        iter += 1
        print("Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}".format(
            iter + 1, opt.num_iters, action, loss, epsilon, reward, torch.max(prediction)))
        writer.add_scalar('Train/Loss', loss, iter)
        writer.add_scalar('Train/Epsilon', epsilon, iter)
        writer.add_scalar('Train/Reward', reward, iter)
        writer.add_scalar('Train/Q-value', torch.max(prediction), iter)
        if iter + 1 == opt.num_iters:
            torch.save(model, "{}/ns_shaft_{}".format(opt.saved_path, iter + 1))
    torch.save(model, "{}/ns_shaft".format(opt.saved_path))
def training(arguments):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    model = DeepQNetwork()
    if os.path.isdir(arguments.log_path):
        shutil.rmtree(arguments.log_path)
    os.makedirs(arguments.log_path)
    writer = SummaryWriter(arguments.log_path)
    optimiser = torch.optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()
    gameState = Flappyplayer()
    image, reward, terminal = gameState.next_frame(0)
    image = pre_processing(image[:gameState.SCREENW, :int(gameState.base_y)],
                           arguments.image_size, arguments.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    replay_mem = []
    iter = 0
    while iter < arguments.iters:
        prediction = model(state)[0]
        # Exploration or exploitation
        epsilon = arguments.final_epsilon + ((arguments.iters - iter) *
                                             (arguments.initial_epsilon - arguments.final_epsilon) / arguments.iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            action = randint(0, 1)
            print(action)
        else:
            # Greedy action from the network's prediction
            action = torch.argmax(prediction).item()
        nextImage, reward, terminal = gameState.next_frame(action)
        nextImage = pre_processing(nextImage[:gameState.SCREENW, :int(gameState.base_y)],
                                   arguments.image_size, arguments.image_size)
        nextImage = torch.from_numpy(nextImage)
        if torch.cuda.is_available():
            nextImage = nextImage.cuda()
        nextState = torch.cat((state[0, 1:, :, :], nextImage))[None, :, :, :]
        replay_mem.append([state, action, reward, nextState, terminal])
        if len(replay_mem) > arguments.replay_mem:
            del replay_mem[0]
        batch = sample(replay_mem, min(len(replay_mem), arguments.batch_size))
        stateBatch, actionBatch, rewardBatch, nextStateBatch, terminalBatch = zip(*batch)
        stateBatch = torch.cat(tuple(state for state in stateBatch))
        actionBatch = torch.from_numpy(
            np.array([[1, 0] if action == 0 else [0, 1] for action in actionBatch], dtype=np.float32))
        rewardBatch = torch.from_numpy(np.array(rewardBatch, dtype=np.float32)[:, None])
        nextStateBatch = torch.cat(tuple(state for state in nextStateBatch))
        if torch.cuda.is_available():
            stateBatch = stateBatch.cuda()
            actionBatch = actionBatch.cuda()
            rewardBatch = rewardBatch.cuda()
            nextStateBatch = nextStateBatch.cuda()
        currentPredBatch = model(stateBatch)
        nextPredBatch = model(nextStateBatch)
        yBatch = torch.cat(
            tuple(reward if terminal else reward + arguments.gamma * torch.max(prediction)
                  for reward, terminal, prediction in zip(rewardBatch, terminalBatch, nextPredBatch)))
        qValue = torch.sum(currentPredBatch * actionBatch, dim=1)
        optimiser.zero_grad()
        loss = criterion(qValue, yBatch)
        loss.backward()
        optimiser.step()
        state = nextState
        iter += 1
        print("Iteration: {}/{}, Action: {}, Loss: {}, Epsilon: {}, Reward: {}, Q-Value: {}".format(
            iter + 1, arguments.iters, action, loss, epsilon, reward, torch.max(prediction)))
        writer.add_scalar('Train/Loss', loss, iter)
        writer.add_scalar('Train/Epsilon', epsilon, iter)
        writer.add_scalar('Train/Reward', reward, iter)
        writer.add_scalar('Train/Q-Value', torch.max(prediction), iter)
        if (iter + 1) % 1000000 == 0:
            torch.save(model, "{}/flappy_bird_{}".format(arguments.saved_path, iter + 1))
    torch.save(model, "{}/flappy_bird".format(arguments.saved_path))
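# The training loops above use three different epsilon schedules. The helpers below restate them
# side by side as plain functions of the loop counter; the names are illustrative, but the formulas
# are copied directly from the loops.
def epsilon_linear_to_zero(initial_eps, epoch, num_epochs):
    # First Tetris variant: straight line from initial_eps down to 0 over the whole run.
    return initial_eps - initial_eps * epoch / num_epochs


def epsilon_decay_then_floor(initial_eps, final_eps, epoch, num_decay_epochs):
    # Other Tetris variants: linear decay for num_decay_epochs epochs, then constant at final_eps.
    return final_eps + max(num_decay_epochs - epoch, 0) * (initial_eps - final_eps) / num_decay_epochs


def epsilon_over_all_iters(initial_eps, final_eps, it, num_iters):
    # Flappy Bird / NS-SHAFT variants: linear decay over all iterations, ending at final_eps.
    return final_eps + (num_iters - it) * (initial_eps - final_eps) / num_iters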