def evaluate(
    model: TEDD1104,
    X: torch.Tensor,
    golds: torch.Tensor,
    device: torch.device,
    batch_size: int,
) -> float:
    """
    Given a set of input examples and the golds for these examples evaluates the model accuracy
    Input:
     - model: TEDD1104 model to evaluate
     - X: input examples [num_examples, sequence_size, 3, H, W]
     - golds: golds for the input examples [num_examples]
     - device: string, use cuda or cpu
     -batch_size: integer batch size
    Output:
    - Accuracy: float
    """
    model.eval()
    correct = 0
    for X_batch, y_batch in nn_batchs(X, golds, batch_size):
        predictions: np.ndarray = model.predict(X_batch.to(device)).cpu().numpy()
        correct += np.sum(predictions == y_batch)

    return correct / len(golds)
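
# A minimal sketch of the nn_batchs helper used above (illustrative only; the
# project's real helper may differ): it is assumed to yield consecutive
# (X_batch, y_batch) slices of size batch_size until the examples are exhausted.
def _nn_batchs_sketch(X, golds, batch_size):
    for start in range(0, len(golds), batch_size):
        yield X[start : start + batch_size], golds[start : start + batch_size]
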
def evaluate(
    model: TEDD1104,
    data_loader: DataLoader,
    device: torch.device,
    fp16: bool,
) -> float:
    """
    Given a set of input examples and the golds for these examples evaluates the model accuracy
    Input:
     - model: TEDD1104 model to evaluate
     - data_loader: torch.utils.data.DataLoader with the examples to evaluate
     - device: string, use cuda or cpu
     -batch_size: integer batch size
    Output:
    - Accuracy: float
    """
    model.eval()
    correct = 0
    total = 0

    for batch in tqdm(data_loader, desc="Evaluating model"):
        x = torch.flatten(
            torch.stack(
                (
                    batch["image1"],
                    batch["image2"],
                    batch["image3"],
                    batch["image4"],
                    batch["image5"],
                ),
                dim=1,
            ),
            start_dim=0,
            end_dim=1,
        ).to(device)

        y = batch["y"]

        if fp16:
            with autocast():
                predictions: torch.Tensor = model.predict(x).cpu()
        else:
            predictions: torch.Tensor = model.predict(x).cpu()

        correct += (predictions == y).sum().item()
        total += len(predictions)

    return correct / total
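
# Shape sketch for the stack/flatten pattern above (illustrative; H and W are
# placeholder values, not taken from the project): five per-image tensors of shape
# [batch_size, 3, H, W] are stacked on dim=1 and flattened into the
# [batch_size * 5, 3, H, W] layout that model.predict() is assumed to expect.
def _stack_flatten_shape_demo():
    batch_size, H, W = 2, 270, 480  # placeholder sizes
    images = [torch.zeros(batch_size, 3, H, W) for _ in range(5)]
    x = torch.flatten(torch.stack(images, dim=1), start_dim=0, end_dim=1)
    assert x.shape == (batch_size * 5, 3, H, W)
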
def train_new_model(
    train_dir="Data\\GTAV-AI\\data-v2\\train\\",
    dev_dir="Data\\GTAV-AI\\data-v2\\dev\\",
    test_dir="Data\\GTAV-AI\\data-v2\\test\\",
    output_dir="Data\\models\\",
    batch_size=10,
    num_epoch=20,
    optimizer_name="SGD",
    resnet: int = 18,
    pretrained_resnet: bool = True,
    sequence_size: int = 5,
    embedded_size: int = 256,
    hidden_size: int = 128,
    num_layers_lstm: int = 1,
    bidirectional_lstm: bool = False,
    layers_out: List[int] = None,
    dropout_cnn: float = 0.1,
    dropout_cnn_out: float = 0.1,
    dropout_lstm: float = 0.1,
    dropout_lstm_out: float = 0.1,
    hide_map_prob: float = 0.0,
    num_load_files_training: int = 5,
    fp16=True,
    apex_opt_level="O2",
    save_checkpoints=True,
    save_every: int = 100,
    save_best=True,
):
    """
    Train a new model

    Input:
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - num_epochs: Number of epochs to do
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - resnet: resnet module to use [18,34,50,101,152]
    - pretrained_resnet: Load pretrained resnet weights
    - sequence_size: Length of each series of features
    - embedded_size: Size of the feature vectors
    - hidden_size: LSTM hidden size
    - num_layers_lstm: number of layers in the LSTM
    - bidirectional_lstm: forward or bidirectional LSTM
    - layers_out: list of integer, for each integer i a linear layer with i neurons will be added.
    - dropout_cnn: dropout probability for the CNN layers
    - dropout_cnn_out: dropout probability for the cnn features (output layer)
    - dropout_lstm: dropout probability for the LSTM
    - dropout_lstm_out: dropout probability for the LSTM features (output layer)
    - hide_map_prob: Probability for removing the minimap (black square) from the image (0<=hide_map_prob<=1)
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training Nvidia apex opt level
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_best: save the model that achieves the higher accuracy in the development set

    Output:

    """

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    print("Loading new model")
    model: TEDD1104 = TEDD1104(
        resnet=resnet,
        pretrained_resnet=pretrained_resnet,
        sequence_size=sequence_size,
        embedded_size=embedded_size,
        hidden_size=hidden_size,
        num_layers_lstm=num_layers_lstm,
        bidirectional_lstm=bidirectional_lstm,
        layers_out=layers_out,
        dropout_cnn=dropout_cnn,
        dropout_cnn_out=dropout_cnn_out,
        dropout_lstm=dropout_lstm,
        dropout_lstm_out=dropout_lstm_out,
    ).to(device)

    if optimizer_name == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    elif optimizer_name == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=0.001)
    else:
        raise ValueError(
            f"Optimizer {optimizer_name} not implemented. Available optimizers: SGD, Adam"
        )

    if fp16:
        model, optimizer = amp.initialize(
            model,
            optimizer,
            opt_level=apex_opt_level,
            keep_batchnorm_fp32=True,
            loss_scale="dynamic",
        )

    max_acc = train(
        model=model,
        optimizer_name=optimizer_name,
        optimizer=optimizer,
        train_dir=train_dir,
        dev_dir=dev_dir,
        test_dir=test_dir,
        output_dir=output_dir,
        batch_size=batch_size,
        initial_epoch=0,
        num_epoch=num_epoch,
        max_acc=0.0,
        hide_map_prob=hide_map_prob,
        num_load_files_training=num_load_files_training,
        fp16=fp16,
        amp_opt_level=apex_opt_level if fp16 else None,
        save_checkpoints=save_checkpoints,
        save_every=save_every,
        save_best=save_best,
    )

    print(f"Training finished, max accuracy in the development set {max_acc}")
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    num_load_files_training: int,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    save_every: int = 100,
    save_best: bool = True,
):
    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been
      restored from checkpoint)
    - num_epochs: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been
      restored from checkpoint)
    - hide_map_prob: Probability for removing the minimap (black square) from the image (0<=hide_map_prob<=1)
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training Nvidia apex opt level
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_best: save the model that achieves the higher accuracy in the development set

    Output:
     - float: Accuracy in the development test of the best model
    """

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()
    print("Loading dev set")
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    print("Loading test set")
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)

    acc_dev: float = 0.0
    total_training_exampels: int = 0
    printTrace("Training...")
    for epoch in range(num_epoch):
        iteration_no = 0
        num_used_files: int = 0
        files: List[str] = glob.glob(os.path.join(train_dir, "*.npz"))
        random.shuffle(files)
        # Get files in batches, all files will be loaded and data will be shuffled
        for paths in batch(files, num_load_files_training):
            iteration_no += 1
            num_used_files += num_load_files_training
            model.train()
            start_time: float = time.time()

            X, y = load_and_shuffle_datasets(paths=paths,
                                             fp=16 if fp16 else 32,
                                             hide_map_prob=hide_map_prob)
            total_training_exampels += len(y)
            running_loss = 0.0
            num_batchs = 0

            for X_bacth, y_batch in nn_batchs(X, y, batch_size):
                X_bacth, y_batch = (
                    torch.from_numpy(X_bacth).to(device),
                    torch.from_numpy(y_batch).long().to(device),
                )
                optimizer.zero_grad()
                outputs = model.forward(X_bacth)
                loss = criterion(outputs, y_batch)
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                if fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), 1.0)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                optimizer.step()
                running_loss += loss.item()
                num_batchs += 1
            start_time_eval: float = time.time()
            # Print Statistics
            if len(X) > 0 and len(y) > 0:
                acc_train = evaluate(
                    model=model,
                    X=torch.from_numpy(X),
                    golds=y,
                    device=device,
                    batch_size=batch_size,
                )
            else:
                acc_train = -1.0

            acc_dev = evaluate(
                model=model,
                X=X_dev,
                golds=y_dev,
                device=device,
                batch_size=batch_size,
            )

            acc_test = evaluate(
                model=model,
                X=X_test,
                golds=y_test,
                device=device,
                batch_size=batch_size,
            )

            printTrace(
                f"EPOCH: {initial_epoch+epoch}. Iteration {iteration_no}. "
                f"{num_used_files} of {len(files)} files. "
                f"Total examples used for training {total_training_exampels}. "
                f"Iteration time: {time.time() - start_time} secs. Eval time: {time.time() - start_time_eval} secs."
            )

            printTrace(
                f"Loss: {-1 if num_batchs == 0 else running_loss / num_batchs}. Acc training set: {acc_train}. "
                f"Acc dev set: {acc_dev}. Acc test set: {acc_test}")

            if acc_dev > max_acc and save_best:
                max_acc = acc_dev
                printTrace(
                    f"New max acc in dev set {max_acc}. Saving model...")
                save_model(
                    model=model,
                    save_dir=output_dir,
                    fp16=fp16,
                    amp_opt_level=amp_opt_level,
                )

            if save_checkpoints and iteration_no % save_every == 0:
                printTrace("Saving checkpoint...")
                save_checkpoint(
                    path=os.path.join(output_dir, "checkpoint.pt"),
                    model=model,
                    optimizer_name=optimizer_name,
                    optimizer=optimizer,
                    acc_dev=acc_dev,
                    epoch=initial_epoch + epoch,
                    fp16=fp16,
                    opt_level=amp_opt_level,
                )

    return max_acc
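
# A minimal sketch of the batch() helper used above to group the training files
# (illustrative only; the project's real helper may differ): it is assumed to yield
# consecutive chunks of n items from the given list, with a possibly shorter last chunk.
def _file_batch_sketch(items, n):
    for start in range(0, len(items), n):
        yield items[start : start + n]
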
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    accumulation_steps: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    dropout_images_prob: List[float],
    num_load_files_training: int,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    eval_every: int = 5,
    save_every: int = 20,
    save_best: bool = True,
):
    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been
      restored from checkpoint)
    - num_epochs: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been
      restored from checkpoint)
    - hide_map_prob: Probability for removing the minimap (put a black square)
       from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob List of 5 floats or None, probability for removing each input image during training
     (black image) from a training example (0<=dropout_images_prob<=1)
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training Nvidia apex opt level
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_best: save the model that achieves the higher accuracy in the development set

    Output:
     - float: Accuracy in the development test of the best model
    """
    writer: SummaryWriter = SummaryWriter()

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()
    print("Loading dev set")
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    print("Loading test set")
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)
    total_training_exampels: int = 0
    model.zero_grad()

    printTrace("Training...")
    for epoch in range(num_epoch):
        step_no: int = 0
        iteration_no: int = 0
        num_used_files: int = 0
        data_loader = DataLoaderTEDD(
            dataset_dir=train_dir,
            nfiles2load=num_load_files_training,
            hide_map_prob=hide_map_prob,
            dropout_images_prob=dropout_images_prob,
            fp=16 if fp16 else 32,
        )

        data = data_loader.get_next()
        # Get files in batches, all files will be loaded and data will be shuffled
        while data:
            X, y = data
            model.train()
            start_time: float = time.time()
            total_training_exampels += len(y)
            running_loss: float = 0.0
            num_batchs: int = 0
            acc_dev: float = 0.0

            for X_bacth, y_batch in nn_batchs(X, y, batch_size):
                X_bacth, y_batch = (
                    torch.from_numpy(X_bacth).to(device),
                    torch.from_numpy(y_batch).long().to(device),
                )

                outputs = model.forward(X_bacth)
                loss = criterion(outputs, y_batch) / accumulation_steps
                running_loss += loss.item()

                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                if fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), 1.0)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                if (step_no + 1) % accumulation_steps == 0 or (
                        num_used_files + 1 >
                        len(data_loader) - num_load_files_training
                        and num_batchs == math.ceil(len(y) / batch_size) - 1
                ):  # If we are in the last batch of the epoch we also want to perform gradient descent
                    optimizer.step()
                    model.zero_grad()

                num_batchs += 1
                step_no += 1

            num_used_files += num_load_files_training

            # Print Statistics
            printTrace(
                f"EPOCH: {initial_epoch+epoch}. Iteration {iteration_no}. "
                f"{num_used_files} of {len(data_loader)} files. "
                f"Total examples used for training {total_training_exampels}. "
                f"Iteration time: {round(time.time() - start_time,2)} secs.")
            printTrace(
                f"Loss: {-1 if num_batchs == 0 else running_loss / num_batchs}. "
                f"Learning rate {optimizer.state_dict()['param_groups'][0]['lr']}"
            )
            writer.add_scalar("Loss/train", running_loss / num_batchs,
                              iteration_no)

            scheduler.step(running_loss / num_batchs)

            if (iteration_no + 1) % eval_every == 0:
                start_time_eval: float = time.time()
                if len(X) > 0 and len(y) > 0:
                    acc_train: float = evaluate(
                        model=model,
                        X=torch.from_numpy(X),
                        golds=y,
                        device=device,
                        batch_size=batch_size,
                    )
                else:
                    acc_train = -1.0

                acc_dev: float = evaluate(
                    model=model,
                    X=X_dev,
                    golds=y_dev,
                    device=device,
                    batch_size=batch_size,
                )

                acc_test: float = evaluate(
                    model=model,
                    X=X_test,
                    golds=y_test,
                    device=device,
                    batch_size=batch_size,
                )

                printTrace(
                    f"Acc training set: {round(acc_train,2)}. "
                    f"Acc dev set: {round(acc_dev,2)}. "
                    f"Acc test set: {round(acc_test,2)}.  "
                    f"Eval time: {round(time.time() - start_time_eval,2)} secs."
                )

                if 0.0 < acc_dev > max_acc and save_best:
                    max_acc = acc_dev
                    printTrace(
                        f"New max acc in dev set {round(max_acc,2)}. Saving model..."
                    )
                    save_model(
                        model=model,
                        save_dir=output_dir,
                        fp16=fp16,
                        amp_opt_level=amp_opt_level,
                    )
                if acc_train > -1:
                    writer.add_scalar("Accuracy/train", acc_train,
                                      iteration_no)
                writer.add_scalar("Accuracy/dev", acc_dev, iteration_no)
                writer.add_scalar("Accuracy/test", acc_test, iteration_no)

            if save_checkpoints and (iteration_no + 1) % save_every == 0:
                printTrace("Saving checkpoint...")
                save_checkpoint(
                    path=os.path.join(output_dir, "checkpoint.pt"),
                    model=model,
                    optimizer_name=optimizer_name,
                    optimizer=optimizer,
                    scheduler=scheduler,
                    acc_dev=acc_dev,
                    epoch=initial_epoch + epoch,
                    fp16=fp16,
                    opt_level=amp_opt_level,
                )

            iteration_no += 1
            data = data_loader.get_next()

        data_loader.close()

    return max_acc
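
# Gradient-accumulation pattern used in the loop above, as a standalone sketch
# (generic model/optimizer/criterion, not project code): each micro-batch loss is
# divided by accumulation_steps and optimizer.step() runs once every
# accumulation_steps micro-batches, so the applied update approximates a single
# batch of batch_size * accumulation_steps examples.
def _accumulation_sketch(model, optimizer, criterion, batches, accumulation_steps):
    model.zero_grad()
    for step_no, (x, y) in enumerate(batches):
        loss = criterion(model(x), y) / accumulation_steps
        loss.backward()
        if (step_no + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            model.zero_grad()
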
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    accumulation_steps: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    dropout_images_prob: List[float],
    num_load_files_training: int,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    eval_every: int = 5,
    save_every: int = 20,
    save_best: bool = True,
):

    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been
      restored from checkpoint)
    - num_epochs: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been
      restored from checkpoint)
    - hide_map_prob: Probability for removing the minimap (put a black square)
       from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob List of 5 floats or None, probability for removing each input image during training
     (black image) from a training example (0<=dropout_images_prob<=1)
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training Nvidia apex opt level
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_best: save the model that achieves the higher accuracy in the development set

    Output:
     - float: Accuracy in the development test of the best model
    """
    writer: SummaryWriter = SummaryWriter()

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()
    print("Loading dev set")
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)
    print("Loading test set")
    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)
    total_training_exampels: int = 0
    model.zero_grad()


    trainLoader = DataLoader(dataset=PickleDataset(train_dir), batch_size=batch_size,
                             shuffle=False, num_workers=8)

    printTrace("Training...")
    iteration_no: int = 0
    for epoch in range(num_epoch):

        print('EpochNum: ' + str(epoch))
        model.train()
        start_time: float = time.time()
        running_loss: float = 0.0
        acc_dev: float = 0.0

        for num_batchs, inputs in enumerate(trainLoader):
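            # The PickleDataset batches are assumed to pack the 5 sequence images into
            # inputs[0] (reshaped below to the [batch_size * 5, 3, H, W] layout the model
            # expects) and one label per sequence into inputs[1].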
            X_bacth = torch.reshape(
                inputs[0],
                (inputs[0].shape[0] * 5, 3, inputs[0].shape[2], inputs[0].shape[3]),
            ).to(device)
            y_batch = torch.reshape(inputs[1], (inputs[0].shape[0],)).long().to(device)

            outputs = model.forward(X_bacth)
            loss = criterion(outputs, y_batch) / accumulation_steps
            running_loss += loss.item()

            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.0)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            model.zero_grad()

        #scheduler.step(running_loss)

        # Print Statistics
        printTrace(
            f"Loss: {running_loss / (num_batchs + 1)}. "
            f"Learning rate {optimizer.state_dict()['param_groups'][0]['lr']}"
        )

        writer.add_scalar("Loss/train", running_loss, iteration_no)

        if (iteration_no + 1) % eval_every == 0:
            start_time_eval: float = time.time()

            acc_dev: float = evaluate(
                model=model,
                X=X_dev,
                golds=y_dev,
                device=device,
                batch_size=batch_size,
            )

            acc_test: float = evaluate(
                model=model,
                X=X_test,
                golds=y_test,
                device=device,
                batch_size=batch_size,
            )

            printTrace(
                f"Acc dev set: {round(acc_dev,2)}. "
                f"Acc test set: {round(acc_test,2)}.  "
                f"Eval time: {round(time.time() - start_time_eval,2)} secs."
            )

            if 0.0 < acc_dev > max_acc and save_best:
                max_acc = acc_dev
                printTrace(
                    f"New max acc in dev set {round(max_acc,2)}. Saving model..."
                )
                save_model(
                    model=model,
                    save_dir=output_dir,
                    fp16=fp16,
                    amp_opt_level=amp_opt_level,
                )
            writer.add_scalar("Accuracy/dev", acc_dev, iteration_no)
            writer.add_scalar("Accuracy/test", acc_test, iteration_no)

        if save_checkpoints and (iteration_no + 1) % save_every == 0:
            printTrace("Saving checkpoint...")
            save_checkpoint(
                path=os.path.join(output_dir, "checkpoint.pt"),
                model=model,
                optimizer_name=optimizer_name,
                optimizer=optimizer,
                scheduler=scheduler,
                acc_dev=acc_dev,
                epoch=initial_epoch + epoch,
                fp16=fp16,
                opt_level=amp_opt_level,
            )

        iteration_no += 1

    return max_acc
def train_new_model(
    train_dir="Data\\GTAV-AI\\data-v2\\train\\",
    dev_dir="Data\\GTAV-AI\\data-v2\\dev\\",
    test_dir="Data\\GTAV-AI\\data-v2\\test\\",
    output_dir="Data\\models\\",
    batch_size=10,
    accumulation_steps: int = 1,
    num_epoch=20,
    optimizer_name="SGD",
    learning_rate: float = 0.01,
    scheduler_patience: int = 10000,
    resnet: int = 18,
    pretrained_resnet: bool = True,
    sequence_size: int = 5,
    embedded_size: int = 256,
    hidden_size: int = 128,
    num_layers_lstm: int = 1,
    bidirectional_lstm: bool = False,
    layers_out: List[int] = None,
    dropout_cnn: float = 0.1,
    dropout_cnn_out: float = 0.1,
    dropout_lstm: float = 0.1,
    dropout_lstm_out: float = 0.1,
    hide_map_prob: float = 0.0,
    dropout_images_prob=None,
    fp16=True,
    save_checkpoints=True,
    save_every: int = 20,
    save_best=True,
):
    """
    Train a new model

    Input:
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - num_epochs: Number of epochs to do
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - resnet: resnet module to use [18,34,50,101,152]
    - pretrained_resnet: Load pretrained resnet weights
    - sequence_size: Length of each series of features
    - embedded_size: Size of the feature vectors
    - hidden_size: LSTM hidden size
    - num_layers_lstm: number of layers in the LSTM
    - bidirectional_lstm: forward or bidirectional LSTM
    - layers_out: list of integer, for each integer i a linear layer with i neurons will be added.
    - dropout_cnn: dropout probability for the CNN layers
    - dropout_cnn_out: dropout probability for the cnn features (output layer)
    - dropout_lstm: dropout probability for the LSTM
    - dropout_lstm_out: dropout probability for the LSTM features (output layer)
    - hide_map_prob: Probability for removing the minimap (put a black square)
      from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob List of 5 floats or None, probability for removing each input image during training
     (black image) from a training example (0<=dropout_images_prob<=1)
    - fp16: Use FP16 for training
    - amp_opt_level: If FP16 training Nvidia apex opt level
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_best: save the model that achieves the higher accuracy in the development set

    Output:

    """

    print("Loading new model")
    model: TEDD1104 = TEDD1104(
        resnet=resnet,
        pretrained_resnet=pretrained_resnet,
        sequence_size=sequence_size,
        embedded_size=embedded_size,
        hidden_size=hidden_size,
        num_layers_lstm=num_layers_lstm,
        bidirectional_lstm=bidirectional_lstm,
        layers_out=layers_out,
        dropout_cnn=dropout_cnn,
        dropout_cnn_out=dropout_cnn_out,
        dropout_lstm=dropout_lstm,
        dropout_lstm_out=dropout_lstm_out,
    ).to(device)

    if optimizer_name == "SGD":
        optimizer = optim.SGD(model.parameters(),
                              lr=learning_rate,
                              momentum=0.9,
                              nesterov=True)
    elif optimizer_name == "Adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=learning_rate,
                               weight_decay=0.01)
    else:
        raise ValueError(
            f"Optimizer {optimizer_name} not implemented. Available optimizers: SGD, Adam"
        )

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, verbose=True, patience=scheduler_patience, factor=0.5)

    max_acc = train(
        model=model,
        optimizer_name=optimizer_name,
        optimizer=optimizer,
        scheduler=scheduler,
        train_dir=train_dir,
        dev_dir=dev_dir,
        test_dir=test_dir,
        output_dir=output_dir,
        batch_size=batch_size,
        accumulation_steps=accumulation_steps,
        initial_epoch=0,
        num_epoch=num_epoch,
        running_loss=0.0,
        total_batches=0,
        total_training_examples=0,
        max_acc=0.0,
        hide_map_prob=hide_map_prob,
        dropout_images_prob=dropout_images_prob,
        fp16=fp16,
        scaler=GradScaler() if fp16 else None,
        save_checkpoints=save_checkpoints,
        save_every=save_every,
        save_best=save_best,
    )

    print(f"Training finished, max accuracy in the development set {max_acc}")
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    scaler: GradScaler,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    accumulation_steps: int,
    initial_epoch: int,
    num_epoch: int,
    running_loss: float,
    total_batches: int,
    total_training_examples: int,
    max_acc: float,
    hide_map_prob: float,
    dropout_images_prob: List[float],
    fp16: bool = True,
    save_checkpoints: bool = True,
    save_every: int = 20,
    save_best: bool = True,
):
    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been
      restored from checkpoint)
    - num_epochs: Number of epochs to do
    - max_acc: Accuracy in the development set (0 unless the model has been
      restored from checkpoint)
    - hide_map_prob: Probability for removing the minimap (put a black square)
       from a training example (0<=hide_map_prob<=1)
    - dropout_images_prob List of 5 floats or None, probability for removing each input image during training
     (black image) from a training example (0<=dropout_images_prob<=1)
    - fp16: Use FP16 for training
    - save_checkpoints: save a checkpoint each epoch (Each checkpoint will rewrite the previous one)
    - save_best: save the model that achieves the higher accuracy in the development set

    Output:
     - float: Accuracy in the development test of the best model
    """

    if not os.path.exists(output_dir):
        print(f"{output_dir} does not exits. We will create it.")
        os.makedirs(output_dir)

    writer: SummaryWriter = SummaryWriter()

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss().to(device)
    model.zero_grad()
    print_message("Training...")
    for epoch in range(num_epoch):
        acc_dev: float = 0.0
        num_batches: int = 0
        step_no: int = 0

        data_loader_train = DataLoader(
            Tedd1104Dataset(
                dataset_dir=train_dir,
                hide_map_prob=hide_map_prob,
                dropout_images_prob=dropout_images_prob,
            ),
            batch_size=batch_size,
            shuffle=True,
            num_workers=os.cpu_count(),
            pin_memory=True,
        )
        start_time: float = time.time()
        step_start_time: float = time.time()
        dataloader_delay: float = 0
        model.train()
        for batch in data_loader_train:

            x = torch.flatten(
                torch.stack(
                    (
                        batch["image1"],
                        batch["image2"],
                        batch["image3"],
                        batch["image4"],
                        batch["image5"],
                    ),
                    dim=1,
                ),
                start_dim=0,
                end_dim=1,
            ).to(device)

            y = batch["y"].to(device)
            dataloader_delay += time.time() - step_start_time

            total_training_examples += len(y)

            if fp16:
                with autocast():
                    outputs = model.forward(x)
                    loss = criterion(outputs, y)
                    loss = loss / accumulation_steps

                running_loss += loss.item()
                scaler.scale(loss).backward()

            else:
                outputs = model.forward(x)
                loss = criterion(outputs, y) / accumulation_steps
                running_loss += loss.item()
                loss.backward()

            if ((step_no + 1) % accumulation_steps == 0) or (
                    step_no + 1 >= len(data_loader_train)
            ):  # If we are in the last batch of the epoch we also want to perform gradient descent
                if fp16:
                    # Gradient clipping
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                else:
                    # Gradient clipping
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                    optimizer.step()
                    optimizer.zero_grad()

                total_batches += 1
                num_batches += 1
                scheduler.step(running_loss / total_batches)

                batch_time = round(time.time() - start_time, 2)
                est: float = batch_time * (math.ceil(
                    len(data_loader_train) / accumulation_steps) - num_batches)
                print_message(
                    f"EPOCH: {initial_epoch + epoch}. "
                    f"{num_batches} of {math.ceil(len(data_loader_train)/accumulation_steps)} batches. "
                    f"Total examples used for training {total_training_examples}. "
                    f"Iteration time: {batch_time} secs. "
                    f"Data Loading bottleneck: {round(dataloader_delay, 2)} secs. "
                    f"Epoch estimated time: "
                    f"{str(datetime.timedelta(seconds=est)).split('.')[0]}")

                print_message(
                    f"Loss: {running_loss / total_batches}. "
                    f"Learning rate {optimizer.state_dict()['param_groups'][0]['lr']}"
                )

                writer.add_scalar("Loss/train", running_loss / total_batches,
                                  total_batches)

                if save_checkpoints and (total_batches + 1) % save_every == 0:
                    print_message("Saving checkpoint...")
                    save_checkpoint(
                        path=os.path.join(output_dir, "checkpoint.pt"),
                        model=model,
                        optimizer_name=optimizer_name,
                        optimizer=optimizer,
                        scheduler=scheduler,
                        running_loss=running_loss,
                        total_batches=total_batches,
                        total_training_examples=total_training_examples,
                        acc_dev=max_acc,
                        epoch=initial_epoch + epoch,
                        fp16=fp16,
                        scaler=None if not fp16 else scaler,
                    )

                dataloader_delay: float = 0
                start_time: float = time.time()

            step_no += 1
            step_start_time = time.time()

        del data_loader_train

        print_message("Dev set evaluation...")

        start_time_eval: float = time.time()

        data_loader_dev = DataLoader(
            Tedd1104Dataset(
                dataset_dir=dev_dir,
                hide_map_prob=0,
                dropout_images_prob=[0, 0, 0, 0, 0],
            ),
            batch_size=batch_size // 2,  # Use a smaller batch size to prevent OOM issues
            shuffle=False,
            num_workers=os.cpu_count() // 2,  # Use fewer cores to save RAM
            pin_memory=True,
        )

        acc_dev: float = evaluate(
            model=model,
            data_loader=data_loader_dev,
            device=device,
            fp16=fp16,
        )

        del data_loader_dev

        print_message("Test set evaluation...")
        data_loader_test = DataLoader(
            Tedd1104Dataset(
                dataset_dir=test_dir,
                hide_map_prob=0,
                dropout_images_prob=[0, 0, 0, 0, 0],
            ),
            batch_size=batch_size // 2,  # Use a smaller batch size to prevent OOM issues
            shuffle=False,
            num_workers=os.cpu_count() // 2,  # Use fewer cores to save RAM
            pin_memory=True,
        )

        acc_test: float = evaluate(
            model=model,
            data_loader=data_loader_test,
            device=device,
            fp16=fp16,
        )

        del data_loader_test

        print_message(
            f"Acc dev set: {round(acc_dev*100,2)}. "
            f"Acc test set: {round(acc_test*100,2)}.  "
            f"Eval time: {round(time.time() - start_time_eval,2)} secs.")

        if 0.0 < acc_dev > max_acc and save_best:
            max_acc = acc_dev
            print_message(
                f"New max acc in dev set {round(max_acc, 2)}. Saving model...")
            save_model(
                model=model,
                save_dir=output_dir,
                fp16=fp16,
            )

        writer.add_scalar("Accuracy/dev", acc_dev, epoch)
        writer.add_scalar("Accuracy/test", acc_test, epoch)

    return max_acc
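
# The fp16 branch above follows the standard torch.cuda.amp pattern. A minimal
# standalone sketch of a single optimization step (generic model/optimizer/criterion,
# not TEDD1104-specific):
def _amp_step_sketch(model, optimizer, scaler, criterion, x, y):
    with autocast():
        loss = criterion(model(x), y)
    scaler.scale(loss).backward()  # scale the loss to avoid FP16 gradient underflow
    scaler.unscale_(optimizer)  # unscale gradients in-place before clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(optimizer)  # skips the step if gradients contain inf/NaN
    scaler.update()  # adjust the loss scale for the next iteration
    optimizer.zero_grad()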