Code Example #1
File: train.py  Project: afda1021/dqn_tetris
def train(opt):
    #print('decay', opt.num_decay_epochs)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    env = Tetris(width=opt.width, height=opt.height,
                 block_size=opt.block_size)  # specify the width, height, and block size
    model = DeepQNetwork()  # create the model instance
    #optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()

    state = env.reset()  # initial state, e.g. tensor([0., 0., 0., 0.])
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()

    replay_memory = deque(maxlen=opt.replay_memory_size)  # max size 30000
    epoch = 0
    while epoch < opt.num_epochs:  # repeat for the specified number of epochs
        # For every action the current piece can take, compute the resulting state: {(column, rotation): tensor([,,,]), ...}
        next_steps = env.get_next_states()
        # epsilon-greedy exploration
        #epsilon = opt.final_epsilon + (max(opt.num_decay_epochs - epoch, 0) * (  # constant after num_decay_epochs
        #        opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        epsilon = opt.initial_epsilon - opt.initial_epsilon * epoch / opt.num_epochs  # linear decay
        u = random()  # random value in [0, 1)
        random_action = u <= epsilon  # True -> explore, False -> exploit

        next_actions, next_states = zip(
            *next_steps.items())  # unpack the keys and values of next_steps
        next_states = torch.stack(next_states)  # tensor([[ , , , ], ...]) with one row per action
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        model.eval()
        with torch.no_grad():
            predictions = model(
                next_states
            )[:, 0]  # DeepQNetwork forward pass; one Q-value-like score per candidate action
        model.train()
        # Pick an index into next_steps either randomly or greedily
        if random_action:  # random action
            index = randint(0, len(next_steps) - 1)
        else:  # greedy action (largest prediction)
            index = torch.argmax(predictions).item()

        # Determine the chosen action and the resulting next state
        next_state = next_states[
            index, :]  # next state after taking the chosen action
        action = next_actions[index]  # action: (column from the left, number of rotations)

        reward, done = env.step(
            action, epoch, render=False)  # execute the action and get the reward (score); done=True when the board overflows

        if torch.cuda.is_available():
            next_state = next_state.cuda()
        replay_memory.append(
            [state, reward, next_state, done]
        )  #deque([[tensor([0., 0., 0., 0.]), 1, tensor([0., 0., 2., 4.]), False]],..., maxlen=30000)

        if done:  # board overflowed or the 100-move limit was reached
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            cleared_lines1 = env.cleared_lines1
            cleared_lines2 = env.cleared_lines2
            cleared_lines3 = env.cleared_lines3
            cleared_lines4 = env.cleared_lines4
            state = env.reset()  # initial state, e.g. tensor([0., 0., 0., 0.])
            if torch.cuda.is_available():
                state = state.cuda()
        else:  # game not over yet
            state = next_state  # update the state, e.g. tensor([0., 1., 2., 5.])
            continue  # go back to the top of the while loop
        #if len(replay_memory) < opt.replay_memory_size / 1000:  # warm-up check (skip training until 3000 pieces have been placed)
        #continue  #pass
        # Run the update below every time a game ends (after the optional warm-up check above)
        epoch += 1
        batch = sample(
            replay_memory, min(len(replay_memory), opt.batch_size)
        )  # randomly sample batch_size transitions from replay_memory (or all of them if fewer are stored)
        replay_memory.clear()  # clear the entire buffer

        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(
            tuple(state for state in
                  state_batch))  # tensor([[0., 26., 16., 62.], ...]) with batch_size rows
        reward_batch = torch.from_numpy(
            np.array(reward_batch,
                     dtype=np.float32)[:, None])  # tensor([[1.], ...]) with batch_size rows
        next_state_batch = torch.stack(
            tuple(
                state for state in
                next_state_batch))  # tensor([[0., 32., 13., 72.], ...]) with batch_size rows

        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        q_values = model(
            state_batch)  # predicted Q-values, e.g. tensor([[0.1810], ...]) with batch_size rows
        model.eval()
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)  # predicted Q-values for the next states
        model.train()
        # Compute the target Q-values from the Bellman update
        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(
                      reward_batch, done_batch, next_prediction_batch)))[:,
                                                                         None]

        optimizer.zero_grad()  # reset accumulated gradients
        loss = criterion(q_values, y_batch)  # MSE loss between predictions (q_values) and targets (y_batch)
        """
        length = len(q_values)
        errors = np.zeros([length])
        print('size', len(q_values), len(y_batch))
        for i in range(length):
            print('Q', q_values[i])
            print('Y', y_batch[i])
            errors[i] = (q_values[i] - y_batch[i]) ** 2
        error = np.mean(errors)
        print('error', error)
        print('loss',loss)
        """
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(
                "Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}"
                .format(epoch, opt.num_epochs, action, final_score,
                        final_tetrominoes, final_cleared_lines))
        # Log training scores to CSV (overwrite on the first epoch, then append);
        # a separate name keeps the csv writer from shadowing the SummaryWriter above
        csv_mode = 'w' if epoch == 1 else 'a'
        with open('Score_train.csv', mode=csv_mode,
                  newline="") as Score_train_Record:
            csv_writer = csv.writer(Score_train_Record)
            csv_writer.writerow([
                epoch, final_tetrominoes, final_score, final_cleared_lines,
                cleared_lines1, cleared_lines2, cleared_lines3,
                cleared_lines4
            ])
        """
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, epoch - 1)
        """

        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/tetris2_{}".format(
                opt.saved_path, epoch))  # periodically save the model to trained_models

        if final_tetrominoes > 500:  # save weights and biases to CSV when the model places more than 500 tetrominoes
            save_model_parameter(model)

    torch.save(model,
               "{}/tetris2".format(opt.saved_path))  # save the final model to trained_models
Code Example #2
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    model = DeepQNetwork()
    model_target = DeepQNetwork()
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()
    game_state = FlappyBird()
    image, reward, terminal, score = game_state.next_frame(0)
    image = pre_processing(
        image[:game_state.screen_width, :int(game_state.base_y)],
        opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        model_target.cuda()
        image = image.cuda()
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    model_target.eval()
    replay_memory = []
    iter = 0
    while iter < opt.num_iters:
        prediction = model(state)[0]
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (
            (opt.num_iters - iter) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            #print("Perform a random action")
            action = randint(0, 1)
        else:
            action = torch.argmax(prediction).item()

        next_image, reward, terminal, score = game_state.next_frame(action)
        next_image = pre_processing(
            next_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        next_image = torch.from_numpy(next_image)
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]
        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[0]
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(
            *batch)

        state_batch = torch.cat(tuple(state for state in state_batch))
        action_batch = torch.from_numpy(
            np.array([[1, 0] if action == 0 else [0, 1]
                      for action in action_batch],
                     dtype=np.float32))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.cat(tuple(state
                                           for state in next_state_batch))

        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        current_prediction_batch = model(state_batch)
        next_prediction_batch = model_target(next_state_batch)

        y_batch = torch.cat(
            tuple(reward if terminal else reward +
                  opt.gamma * prediction[max_action]
                  for reward, terminal, prediction, max_action in zip(
                      reward_batch, terminal_batch, next_prediction_batch,
                      torch.argmax(model(next_state_batch), axis=1))))

        q_value = torch.sum(current_prediction_batch * action_batch, dim=1)
        optimizer.zero_grad()
        # y_batch = y_batch.detach()
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()

        state = next_state

        if iter % opt.target_update_freq == 0:
            model_target.load_state_dict(model.state_dict())

        iter += 1
        if iter % 100 == 0:
            print(
                "Test::Double Q: Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}"
                .format(iter + 1, opt.num_iters, action, loss, epsilon, reward,
                        torch.max(prediction)))
        writer.add_scalar('Train/Loss', loss, iter)
        writer.add_scalar('Train/Epsilon', epsilon, iter)
        writer.add_scalar('Train/Reward', reward, iter)
        writer.add_scalar('Train/Q-value', torch.max(prediction), iter)
        writer.add_scalar('Train/score', score, iter)
        if (iter + 1) % 1000000 == 0:
            torch.save(model,
                       "{}/flappy_bird_{}".format(opt.saved_path, iter + 1))
    torch.save(model, "{}/flappy_bird".format(opt.saved_path))
Code Example #3
def train(opt):
    if torch.cuda.is_available():
        # With a fixed random seed, the training results stay reproducible
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    model = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()

    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()

    replay_memory = deque(maxlen=opt.replay_memory_size)
    epoch = 0
    while epoch < opt.num_epochs:
        # Get every possible placement for the falling piece
        next_steps = env.get_next_states()
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (
            max(opt.num_decay_epochs - epoch, 0) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        u = random()
        random_action = u <= epsilon
        # Each key is the next drop's (column, rotation); each value is the resulting board state
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        model.train()
        # Choose the action to take
        if random_action:
            index = randint(0, len(next_steps) - 1)
        else:
            index = torch.argmax(predictions).item()

        next_state = next_states[index, :]
        action = next_actions[index]

        reward, done = env.step(action, render=False)

        if torch.cuda.is_available():
            next_state = next_state.cuda()
        replay_memory.append([state, reward, next_state, done])
        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue
        if len(replay_memory) < opt.replay_memory_size / 10:
            continue
        epoch += 1
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        '''
        a = [2, 3, 4], b = [5, 6, 7], c = [a, b]
        e, f, g = zip(*c)
        e = (2, 5), f = (3, 6), g = (4, 7)  # each result is a tuple
        '''
        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(
            tuple(state for state in next_state_batch))

        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        q_values = model(state_batch)
        model.eval()
        # Q_target
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()

        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(
                      reward_batch, done_batch, next_prediction_batch)))[:,
                                                                         None]

        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()

        print(
            "Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}"
            .format(epoch, opt.num_epochs, action, final_score,
                    final_tetrominoes, final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines,
                          epoch - 1)

        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/tetris_{}".format(opt.saved_path, epoch))

    torch.save(model, "{}/tetris".format(opt.saved_path))
Code Example #4
def test(opt, conv1, conv2, conv3):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # if torch.cuda.is_available():
    #     model = torch.load("{}/tetris".format(opt.saved_path))
    # else:
    #     model = torch.load("{}/tetris".format(opt.saved_path), map_location=lambda storage, loc: storage)

    model = DeepQNetwork()
    model.eval()
    if False:  # save weights
        ii = 1
        for layer in model.modules():
            if isinstance(layer, nn.Linear):
                if ii == 1:
                    weights1 = layer.weight.cpu()
                    weights1 = weights1.detach().numpy()
                    pd.DataFrame(weights1).to_csv(
                        'trained_models/conv{}.csv'.format(ii))
                if ii == 2:
                    weights2 = layer.weight.cpu()
                    weights2 = weights2.detach().numpy()
                    pd.DataFrame(weights2).to_csv(
                        'trained_models/conv{}.csv'.format(ii))
                if ii == 3:
                    weights3 = layer.weight.cpu()
                    weights3 = weights3.detach().numpy()
                    pd.DataFrame(weights3).to_csv(
                        'trained_models/conv{}.csv'.format(ii))
                ii += 1
    if False:  # load csv weights
        ii = 1
        for layer in model.modules():
            if isinstance(layer, nn.Linear):
                with torch.no_grad():
                    if ii == 1:
                        layer.weight.data = torch.Tensor(conv1).cuda()
                    if ii == 2:
                        layer.weight.data = torch.Tensor(conv2).cuda()
                    if ii == 3:
                        layer.weight.data = torch.Tensor(conv3).cuda()
                    ii += 1

    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    env.reset()
    if torch.cuda.is_available():
        model.cuda()
    out = cv2.VideoWriter(
        opt.output, cv2.VideoWriter_fourcc(*"MJPG"), opt.fps,
        (int(1.5 * opt.width * opt.block_size), opt.height * opt.block_size))
    while True:
        next_steps = env.get_next_states()
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        predictions = model(next_states)[:, 0]
        index = torch.argmax(predictions).item()
        action = next_actions[index]
        result, done = env.step(action, render=True, video=out)

        if done:
            out.release()
            return result
Code Example #5
def train(opt):

    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)

    # TensorBoard
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)

    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    
    # Model
    CHECKPOINT_FILE = opt.saved_path + "/" + opt.checkpoint_name

    if opt.load:
        if os.path.isfile(CHECKPOINT_FILE):
            print("--> Loading checkpoint '{}'.".format(CHECKPOINT_FILE))

            if torch.cuda.is_available():
                model = torch.load(CHECKPOINT_FILE)
            else:
                model = torch.load(CHECKPOINT_FILE, map_location=lambda storage, loc: storage)

            print("--> Checkpoint '{}' loaded.".format(CHECKPOINT_FILE))

        else:
            print("--> Checkpoint '{}' not found.".format(CHECKPOINT_FILE))
            model = DeepQNetwork()
    else:
        model = DeepQNetwork()

    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()

    # Environment
    env = Tetris(width=opt.width, height=opt.height)

    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()

    replay_memory = deque(maxlen=opt.replay_memory_size)
    epoch = 0
    prev_loss = 0

    # Epochs already completed by the checkpoint
    if opt.load and "_" in opt.checkpoint_name:
        start_epoch = opt.checkpoint_name.split("_")[-1]
        epoch = int(start_epoch)
        print("Checkpoint at {} epochs.".format(epoch))


    # Training loop
    while epoch < opt.num_epochs:
        next_steps = env.get_next_states()
        
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (max(opt.num_decay_epochs - epoch, 0) * (
                opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        u = random()
        random_action = u <= epsilon
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)

        if torch.cuda.is_available():
            next_states = next_states.cuda()

        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]

        model.train()
        if random_action:
            index = randint(0, len(next_steps) - 1)
        else:
            index = torch.argmax(predictions).item()

        next_state = next_states[index, :]
        action = next_actions[index]

        reward, done = env.step(action, render=True)

        if torch.cuda.is_available():
            next_state = next_state.cuda()

        replay_memory.append([state, reward, next_state, done])

        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue

        # Replay Buffer
        if len(replay_memory) < opt.replay_memory_size / 10:
            print("replay_memory ", len(replay_memory))
            continue
        
        epoch += 1
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(tuple(state for state in next_state_batch))

        # Learning step
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        q_values = model(state_batch)
        model.eval()
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()

        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction for reward, done, prediction in
                  zip(reward_batch, done_batch, next_prediction_batch)))[:, None]

        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()

        prev_loss = loss.item()

        print("Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}".format(
            epoch,
            opt.num_epochs,
            action,
            final_score,
            final_tetrominoes,
            final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, epoch - 1)

        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/{}_{}".format(opt.saved_path, opt.saved_name, epoch))

    torch.save(model, "{}/{}".format(opt.saved_path, opt.saved_name))
Code Example #6
def train(opt):
    cv2.setUseOptimized(True)
    print("cv2 is optimized =", cv2.useOptimized())
    print("cuda available =", torch.cuda.is_available())
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    
    #############
    # The environment in this situation is the Tetris game itself
    # Create a Tetris object with the constructor in src/tetris.py and pass in the 3 arguments described there
    # the arguments' default values are stored in "opt", which comes from the argument parser above
    #############
    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    

    model = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    
    
    criterion = nn.MSELoss()

    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()

    replay_memory = deque(maxlen=opt.replay_memory_size)
    epoch = 0
    while epoch < opt.num_epochs:
        next_steps = env.get_next_states()
        # Exploration or exploitation
        
        #############
        # epsilon = a threshold that decides how likely it is that a random action is performed; also often called "eps_threshold"
        # Insert a function that creates a decaying value per epoch, between 1 and 0. The number of epochs is defined as input, with a default in the parser (top of this file)
        # The best results will probably come from a function that decays over the first x epochs and stays at some final low value, e.g. 0.001, for the last y epochs
        #############
        epsilon = opt.final_epsilon + (max(opt.num_decay_epochs - epoch, 0) *
                                       (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)

        #############
        # u = a random number between 0 and 1 that randomly decides whether a random action is performed or an action chosen by the model
        #############
        u = random()
        random_action = u <= epsilon

    
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        
        # if cuda is available, move network to GPU 
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        
        model.eval()  # put the model in evaluation mode (built-in function)
        
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        
        model.train()

        if random_action:
            index = randint(0, len(next_steps) - 1)
        else:
            index = torch.argmax(predictions).item()

        next_state = next_states[index, :]
        action = next_actions[index]

        #############
        # reward = metric of how well the model has performed
        # done = whether or not the model is finished with the current game
        # a function that outputs both of these state variables can be found in src/tetris.py
        # remember to pass in the second argument as well, otherwise all training will be done with visualization, which is cool to watch but slow
        #############
        reward, done = env.step(action, render=False)

        if torch.cuda.is_available():
            next_state = next_state.cuda()
        replay_memory.append([state, reward, next_state, done])
        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue
        if len(replay_memory) < opt.replay_memory_size / 10:
            continue
        
        epoch += 1  # increment the epoch counter

        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(tuple(state for state in next_state_batch))

        # if cuda is available, move network to GPU 
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        q_values = model(state_batch)

        model.eval()  # put the model in evaluation mode (built-in function)

        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()

        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction for reward, done, prediction in
                  zip(reward_batch, done_batch, next_prediction_batch)))[:, None]

        
        #############
        # Optimizers are algorithms or methods used to change the attributes of your neural network such as weights and learning rate in order to reduce the losses
        # 
        # In order to ensure that behaviour learned from one epoch is not used in multiple epochs to tune a network(accumulating gradients), 
        # we need to set all gradients to zero, this is a built in function 
        #############
        optimizer.zero_grad()
        
        #############
        # A criterion is a loss function that compares two inputs as tensors(pytorch datatype)
        # 
        # Pass in two tensors into our criterion(loss function, set to MSELoss())
        #############
        loss = criterion(q_values, y_batch)
        
        #############
        # The criterion outputs a loss object that stores the loss value and enables us to tune our neural network by backpropagation
        # 
        # The loss function has access to a built in function that does this
        #############
        loss.backward()

        #############
        # Optimizers are algorithms or methods used to change the attributes of your neural network, such as weights and learning rate, in order to reduce the losses
        #
        # The optimizer should here perform a parameter update based on the current gradient, and does this with a built-in function
        #############
        optimizer.step()

        print("Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}".format(
            epoch,
            opt.num_epochs,
            action,
            final_score,
            final_tetrominoes,
            final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines, epoch - 1)

        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/tetris_{}".format(opt.saved_path, epoch))

    #############
    # PyTorch offers a way of saving either full models or a model state's weights as a file
    # 
    # Search on pytorch.org for this save function
    #############
    torch.save(model, "{}/tetris".format(opt.saved_path))
Code Example #7
File: train.py  Project: martinzwm/tetris-ai
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    model = DeepQNetwork()
    model_target = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()

    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        model_target.cuda()
        state = state.cuda()

    if opt.PER:
        replay_memory = Memory(capacity=opt.replay_memory_size)
    else:
        replay_memory = deque(maxlen=opt.replay_memory_size)

    epoch = 0
    warmup_epoch = 0
    while epoch < opt.num_epochs:
        next_steps = env.get_next_states()
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (
            max(opt.num_decay_epochs - epoch, 0) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        u = random()
        random_action = u <= epsilon
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        model.train()
        if random_action:
            index = randint(0, len(next_steps) - 1)
        else:
            index = torch.argmax(predictions).item()

        next_state = next_states[index, :]
        action = next_actions[index]

        reward, done = env.step(action, render=True)

        if torch.cuda.is_available():
            next_state = next_state.cuda()

        if opt.PER:
            experience = state, action, reward, next_state, done
            replay_memory.store(experience)
        else:
            replay_memory.append([state, action, reward, next_state, done])

        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue
        warmup_epoch += 1
        if warmup_epoch < opt.learning_starts:
            continue
        epoch += 1

        if opt.PER:
            tree_idx, batch = replay_memory.sample(opt.batch_size)
        else:
            batch = sample(replay_memory,
                           min(len(replay_memory), opt.batch_size))

        state_batch, _, reward_batch, next_state_batch, done_batch = zip(
            *batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(
            tuple(state for state in next_state_batch))

        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        q_values = model(state_batch)
        model_target.eval()
        with torch.no_grad():
            next_prediction_batch = model_target(next_state_batch)
        model_target.train()

        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(
                      reward_batch, done_batch, next_prediction_batch)))[:,
                                                                         None]

        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()

        model.eval()
        model_target.eval()
        if opt.PER:
            with torch.no_grad():
                if torch.cuda.is_available():
                    replay_memory.batch_update(
                        tree_idx,
                        np.abs(q_values.detach().cpu().numpy() -
                               y_batch.cpu().numpy()))
                else:
                    replay_memory.batch_update(
                        tree_idx,
                        np.abs(q_values.detach().numpy() - y_batch.numpy()))

        # Update target model <- model
        if epoch % opt.target_update_freq == 0:
            with torch.no_grad():
                model_target.load_state_dict(model.state_dict())
        model_target.train()
        model.eval()

        print(
            "Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}"
            .format(epoch, opt.num_epochs, action, final_score,
                    final_tetrominoes, final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines,
                          epoch - 1)

        if (epoch > 0
                and epoch % opt.save_interval == 0) or final_score >= 10000.0:
            torch.save(model, "{}/tetris_{}".format(opt.saved_path, epoch))

    torch.save(model, "{}/tetris".format(opt.saved_path))
Code Example #8
def train(opt):
    # Set random seed
    if torch.cuda.is_available():
        torch.cuda.manual_seed(opt.random_seed)
    else:
        torch.manual_seed(opt.random_seed)
    # Instantiate the model
    if opt.conv_dim is not None and \
       opt.conv_kernel_sizes is not None and \
       opt.conv_strides is not None and \
       opt.fc_dim is not None:
        model = DeepQNetwork(opt.image_size,
                             opt.image_size,
                             conv_dim=opt.conv_dim,
                             conv_kernel_sizes=opt.conv_kernel_sizes,
                             conv_strides=opt.conv_strides,
                             fc_dim=opt.fc_dim)
    else:
        model = DeepQNetwork(opt.image_size, opt.image_size)

    if opt.log_comet_ml:
        # Create a Comet.ml experiment
        experiment = Experiment(api_key=opt.comet_ml_api_key,
                                project_name=opt.comet_ml_project_name,
                                workspace=opt.comet_ml_workspace)
        experiment.log_other("iters_to_save", opt.iters_to_save)
        experiment.log_other("completed", False)
        experiment.log_other("random_seed", opt.random_seed)

        # Report hyperparameters to Comet.ml
        hyper_params = {
            "image_size": opt.image_size,
            "batch_size": opt.batch_size,
            "optimizer": opt.optimizer,
            "learning_rate": opt.lr,
            "gamma": opt.gamma,
            "initial_epsilon": opt.initial_epsilon,
            "final_epsilon": opt.final_epsilon,
            "num_iters": opt.num_iters,
            "replay_memory_size": opt.replay_memory_size,
            "random_seed": opt.random_seed,
            "conv_dim": opt.conv_dim,
            "conv_kernel_sizes": opt.conv_kernel_sizes,
            "conv_strides": opt.conv_strides,
            "fc_dim": opt.fc_dim
        }
        experiment.log_parameters(hyper_params)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-6)  # Optimization algorithm
    criterion = nn.MSELoss()  # Loss function
    game_state = FlappyBird()  # Instantiate the Flappy Compass game
    image, reward, terminal = game_state.next_frame(
        0
    )  # Get the next image, along with its reward and an indication if it's a terminal state

    # Image preprocessing step (scaling, color removal and conversion to a PyTorch tensor)
    image = pre_processing(
        image[:game_state.screen_width, :int(game_state.base_y)],
        opt.image_size, opt.image_size)
    image = torch.from_numpy(image)

    # Move the model and the current image data to the GPU, if available
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()

    # Prepare the state variable, which will host the last 4 frames
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]

    # Initialize the replay memory, which saves sets of consecutive game states, the reward and terminal state indicator
    # so that the model can learn from them (essentially constitutes the training data, which grows with every new iteration)
    replay_memory = []

    iter = 0  # Iteration counter

    # Main training loop performing the number of iterations specified by num_iters
    while iter < opt.num_iters:
        prediction = model(state)[0]  # Get a prediction from the current state
        epsilon = opt.final_epsilon + (
            (opt.num_iters - iter) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters
        )  # Set the decay of the probability of random actions
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            action = randint(0, 1)
        else:
            # Use the model's prediction to decide the next action
            action = torch.argmax(prediction).item()

        # Get a new frame and process it
        next_image, reward, terminal = game_state.next_frame(action)
        next_image = pre_processing(
            next_image[:game_state.screen_width, :int(game_state.base_y)],
            opt.image_size, opt.image_size)
        next_image = torch.from_numpy(next_image)

        # Move the next image data to the GPU, if available
        if torch.cuda.is_available():
            next_image = next_image.cuda()

        next_state = torch.cat(
            (state[0, 1:, :, :], next_image)
        )[None, :, :, :]  # Prepare the next state variable, which will host the last 4 frames
        replay_memory.append(
            [state, action, reward, next_state, terminal]
        )  # Save the current state, action, next state and terminal state indicator in the replay memory
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[
                0]  # Delete the oldest replay from memory if full capacity has been reached
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size)
                       )  # Retrieve past play sequences from the replay memory
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(
            *batch)

        state_batch = torch.cat(tuple(
            state for state in state_batch))  # States of the current batch
        action_batch = torch.from_numpy(
            np.array([[1, 0] if action == 0 else [0, 1]
                      for action in action_batch],
                     dtype=np.float32))  # Actions taken in the current batch
        reward_batch = torch.from_numpy(
            np.array(reward_batch,
                     dtype=np.float32)[:,
                                       None])  # Rewards in the current batch
        next_state_batch = torch.cat(tuple(
            state
            for state in next_state_batch))  # Next states of the current batch

        # Move batch data to the GPU, if available
        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        current_prediction_batch = model(
            state_batch
        )  # Predictions of the model for the replays of the current batch
        next_prediction_batch = model(
            next_state_batch
        )  # Next predictions of the model for the replays of the current batch

        # Set ground truth for the rewards for the current batch, considering whether the state is terminal or not
        y_batch = torch.cat(
            tuple(reward if terminal else reward +
                  opt.gamma * torch.max(prediction)
                  for reward, terminal, prediction in zip(
                      reward_batch, terminal_batch, next_prediction_batch)))

        q_value = torch.sum(
            current_prediction_batch * action_batch, dim=1
        )  # Predicted Q values (i.e. estimated return for each action)
        optimizer.zero_grad(
        )  # Reset the gradients to zero before a new optimization step
        loss = criterion(q_value, y_batch)  # Calculate the loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Weights optimization step

        state = next_state  # Move to the next frame
        iter += 1
        print(
            "Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}"
            .format(iter + 1, opt.num_iters, action, loss, epsilon, reward,
                    torch.max(prediction)))

        if opt.log_comet_ml:
            # Log metrics to Comet.ml
            experiment.log_metric("train_loss", loss, step=iter)
            experiment.log_metric("train_epsilon", epsilon, step=iter)
            experiment.log_metric("train_reward", reward, step=iter)
            experiment.log_metric("train_Q_value",
                                  torch.max(prediction),
                                  step=iter)

        if (iter + 1) % opt.iters_to_save == 0:
            # Get the current day and time to attach to the saved model's name
            current_datetime = datetime.now().strftime('%d_%m_%Y_%H_%M')

            # Set saved model name
            model_filename = f'{opt.saved_path}/flappy_compass_{current_datetime}_{iter+1}.pth'

            # Save model every iters_to_save iterations
            torch.save(model, model_filename)

            if opt.log_comet_ml and opt.comet_ml_save_model:
                # Upload model to Comet.ml
                experiment.log_asset(file_path=model_filename, overwrite=True)

    # Get the current day and time to attach to the saved model's name
    current_datetime = datetime.now().strftime('%d_%m_%Y_%H_%M')

    # Set saved model name
    model_filename = f'{opt.saved_path}/flappy_compass_{current_datetime}_{iter+1}.pth'

    # Save the model after reaching the final iteration
    torch.save(model, model_filename)

    if opt.log_comet_ml:
        # Only report that the experiment completed successfully if it finished the training without errors
        experiment.log_other("completed", True)

        if opt.comet_ml_save_model:
            # Upload model to Comet.ml
            experiment.log_asset(file_path=model_filename, overwrite=True)
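Note: the Flappy Bird examples (#2, #8, #9, #10) crop each frame and pass it through pre_processing(image, width, height), which is not shown in this listing. A minimal sketch under the assumption that it follows the usual pipeline in these DQN repositories: resize, convert to grayscale, binarize, and return a float32 array with a leading channel dimension so four consecutive frames can be stacked into the state tensor. The exact thresholding details are assumptions.

import cv2
import numpy as np


def pre_processing(image, width, height):
    # Resize the cropped frame, drop color, and binarize so the network sees a
    # sparse black-and-white image; the returned shape is (1, height, width).
    image = cv2.cvtColor(cv2.resize(image, (width, height)), cv2.COLOR_BGR2GRAY)
    _, image = cv2.threshold(image, 1, 255, cv2.THRESH_BINARY)
    return image[None, :, :].astype(np.float32)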
Code Example #9
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    model = DeepQNetwork()
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()
    game_state = NS_SHAFT()
    image, reward, terminal = game_state.next_frame(0)
    imgplot = plt.imshow(image)
    image = pre_processing(image[:game_state.screen_width, :int(400)],
                           opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]

    replay_memory = []
    iter = 0
    while iter < opt.num_iters:
        #plt.plot([1,2,3,4])

        prediction = model(state)
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (
            (opt.num_iters - iter) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            action = randint(-1, 1) * 4
        else:
            #print('prediction',prediction)
            action = (prediction.data.max(1)[1].item() - 1) * 4
            #print('a',action)

        next_image, reward, terminal = game_state.next_frame(action)
        next_image = pre_processing(
            next_image[:game_state.screen_width, :int(400)], opt.image_size,
            opt.image_size)
        next_image = torch.from_numpy(next_image)
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]
        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[0]
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(
            *batch)

        state_batch = torch.cat(tuple(state for state in state_batch))
        action_batch_tmp = []
        for action in action_batch:
            if action == -4:
                action_batch_tmp.append([1, 0, 0])
            elif action == 0:
                action_batch_tmp.append([0, 1, 0])
            else:
                action_batch_tmp.append([0, 0, 1])
        action_batch = torch.from_numpy(
            np.array(action_batch_tmp, dtype=np.float32))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.cat(tuple(state
                                           for state in next_state_batch))

        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        current_prediction_batch = model(state_batch)
        next_prediction_batch = model(next_state_batch)

        y_batch = torch.cat(
            tuple(reward if terminal else reward +
                  opt.gamma * torch.max(prediction)
                  for reward, terminal, prediction in zip(
                      reward_batch, terminal_batch, next_prediction_batch)))

        q_value = torch.sum(current_prediction_batch * action_batch, dim=1)
        optimizer.zero_grad()
        # y_batch = y_batch.detach()
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()

        state = next_state
        iter += 1

        print(
            "Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}"
            .format(iter + 1, opt.num_iters, action, loss, epsilon, reward,
                    torch.max(prediction)))
        writer.add_scalar('Train/Loss', loss, iter)
        writer.add_scalar('Train/Epsilon', epsilon, iter)
        writer.add_scalar('Train/Reward', reward, iter)
        writer.add_scalar('Train/Q-value', torch.max(prediction), iter)
        if iter + 1 == opt.num_iters:
            torch.save(model, "{}/ns_shaft_{}".format(opt.saved_path,
                                                      iter + 1))
    torch.save(model, "{}/ns_shaft".format(opt.saved_path))
Code Example #10
def training(arguments):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)

    model = DeepQNetwork()
    if os.path.isdir(arguments.log_path):
        shutil.rmtree(arguments.log_path)
    os.makedirs(arguments.log_path)
    writer = SummaryWriter(arguments.log_path)
    optimiser = torch.optim.Adam(model.parameters(), lr=1e-6)
    criterion = nn.MSELoss()
    gameState = Flappyplayer()
    image, reward, terminal = gameState.next_frame(0)
    image = pre_processing(image[:gameState.SCREENW, :int(gameState.base_y)], arguments.image_size,
                           arguments.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]
    replay_mem = []
    iter = 0
    while iter < arguments.iters:
        prediction = model(state)[0]
        # Exploration or exploitation
        epsilon = arguments.final_epsilon + (
                (arguments.iters - iter) * (arguments.initial_epsilon - arguments.final_epsilon) / arguments.iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            action = randint(0, 1)
            print(action)
        else:
            action = torch.argmax(prediction).item()

        nextImage, reward, terminal = gameState.next_frame(action)
        nextImage = pre_processing(nextImage[:gameState.SCREENW, :int(gameState.base_y)], arguments.image_size,
                                   arguments.image_size)
        nextImage = torch.from_numpy(nextImage)
        if torch.cuda.is_available():
            nextImage = nextImage.cuda()
        nextState = torch.cat((state[0, 1:, :, :], nextImage))[None, :, :, :]
        replay_mem.append([state, action, reward, nextState, terminal])
        if len(replay_mem) > arguments.replay_mem:
            del replay_mem[0]

        batch = sample(replay_mem, min(len(replay_mem), arguments.batch_size))
        stateBatch, actionBatch, rewardBatch, nextStateBatch, terminalBatch = zip(*batch)

        stateBatch = torch.cat(tuple(state for state in stateBatch))
        actionBatch = torch.from_numpy(np.array([[1,0] if action == 0 else [0,1] for action in actionBatch], dtype=np.float32))
        rewardBatch = torch.from_numpy(np.array(rewardBatch, dtype=np.float32)[:, None])
        nextStateBatch = torch.cat(tuple(state for state in nextStateBatch))

        if torch.cuda.is_available():
            stateBatch = stateBatch.cuda()
            actionBatch = actionBatch.cuda()
            rewardBatch = rewardBatch.cuda()
            nextStateBatch = nextStateBatch.cuda()
        currentPredBatch = model(stateBatch)
        nextPredBatch = model(nextStateBatch)

        yBatch = torch.cat(tuple(reward if terminal else reward + arguments.gamma * torch.max(prediction) for reward, terminal, prediction in zip(rewardBatch, terminalBatch, nextPredBatch)))
        qValue = torch.sum(currentPredBatch*actionBatch, dim=1)
        optimiser.zero_grad()
        loss = criterion(qValue, yBatch)
        loss.backward()
        optimiser.step()

        state = nextState
        iter += 1
        print("Iteration: {}/{}, Action: {}, Loss: {}, Epsilon: {}, Reward: {}, Q-Value: {}".format(iter+1,arguments.iters, action, loss, epsilon, reward, torch.max(prediction)))
        writer.add_scalar('Train/Loss', loss, iter)
        writer.add_scalar('Train/Epsilon', epsilon, iter)
        writer.add_scalar('Train/Reward', reward, iter)
        writer.add_scalar('Train/Q-Value', torch.max(prediction), iter)
        if (iter+1) % 1000000 == 0:
            torch.save(model, "{}/flappy_bird_{}".format(arguments.saved_path, iter+1))

    torch.save(model, "{}/flappy_bird".format(arguments.saved_path))