Example 1
import csv
import datetime
import os
import pickle
import time
from itertools import zip_longest

import torch

# Project-specific helpers (parse_arguments, FileHelper, TrainHelper, MetricsHelper,
# get_sender_receiver, get_trainer, get_training_data, get_meta_data, load_model_state,
# save_model_state, plot_data) are imported from the surrounding package, not shown here.


def baseline(args):
    args = parse_arguments(args)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    file_helper = FileHelper()
    train_helper = TrainHelper(device)
    train_helper.seed_torch(seed=args.seed)

    model_name = train_helper.get_filename_from_baseline_params(args)
    run_folder = file_helper.get_run_folder(args.folder, model_name)

    metrics_helper = MetricsHelper(run_folder, args.seed)

    # get sender and receiver models and save them
    sender, receiver, diagnostic_receiver = get_sender_receiver(device, args)

    sender_file = file_helper.get_sender_path(run_folder)
    receiver_file = file_helper.get_receiver_path(run_folder)
    # torch.save(sender, sender_file)

    if receiver:
        torch.save(receiver, receiver_file)

    model = get_trainer(
        sender,
        device,
        args.dataset_type,
        receiver=receiver,
        diagnostic_receiver=diagnostic_receiver,
        vqvae=args.vqvae,
        rl=args.rl,
        entropy_coefficient=args.entropy_coefficient,
        myopic=args.myopic,
        myopic_coefficient=args.myopic_coefficient,
    )

    model_path = file_helper.create_unique_model_path(model_name)

    best_accuracy = -1.0
    epoch = 0
    iteration = 0

    if args.resume_training or args.test_mode:
        epoch, iteration, best_accuracy = load_model_state(model, model_path)
        print(
            f"Loaded model. Resuming from - epoch: {epoch} | iteration: {iteration} | best accuracy: {best_accuracy}"
        )

    if not os.path.exists(file_helper.model_checkpoint_path):
        print("No checkpoint exists. Saving model...\r")
        torch.save(model.visual_module, file_helper.model_checkpoint_path)
        print("No checkpoint exists. Saving model...Done")

    train_data, valid_data, test_data, valid_meta_data, _ = get_training_data(
        device=device,
        batch_size=args.batch_size,
        k=args.k,
        debugging=args.debugging,
        dataset_type=args.dataset_type,
    )

    train_meta_data, valid_meta_data, test_meta_data = get_meta_data()

    # dump arguments
    with open(f"{run_folder}/experiment_params.p", "wb") as params_file:
        pickle.dump(args, params_file)

    pytorch_total_params = sum(p.numel() for p in model.parameters())

    if not args.disable_print:
        # Print info
        print("----------------------------------------")
        print(
            "Model name: {} \n|V|: {}\nL: {}".format(
                model_name, args.vocab_size, args.max_length
            )
        )
        print(sender)
        if receiver:
            print(receiver)

        if diagnostic_receiver:
            print(diagnostic_receiver)

        print("Total number of parameters: {}".format(pytorch_total_params))

    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # Train
    current_patience = args.patience
    # best_accuracy is already initialised above and restored when resuming,
    # so it is deliberately not reset here.
    converged = False

    start_time = time.time()

    if args.test_mode:
        test_loss_meter, test_acc_meter, _ = train_helper.evaluate(
            model, test_data, test_meta_data, device, args.rl
        )

        average_test_accuracy = test_acc_meter.avg
        average_test_loss = test_loss_meter.avg

        print(
            f"TEST results: loss: {average_test_loss} | accuracy: {average_test_accuracy}"
        )
        return

    iterations = []
    losses = []
    hinge_losses = []
    rl_losses = []
    entropies = []
    accuracies = []

    while iteration < args.iterations:
        for train_batch in train_data:
            print(f"{iteration}/{args.iterations}       \r", end="")

            # !!! This is the complete training step; everything after it is only logging.
            _, _ = train_helper.train_one_batch(
                model, train_batch, optimizer, train_meta_data, device
            )

            if iteration % args.log_interval == 0:

                if not args.rl:
                    valid_loss_meter, valid_acc_meter, _ = train_helper.evaluate(
                        model, valid_data, valid_meta_data, device, args.rl
                    )
                else:
                    (
                        valid_loss_meter,
                        hinge_loss_meter,
                        rl_loss_meter,
                        entropy_meter,
                        valid_acc_meter,
                        _,
                    ) = train_helper.evaluate(
                        model, valid_data, valid_meta_data, device, args.rl
                    )

                new_best = False

                average_valid_accuracy = valid_acc_meter.avg

                if (
                    average_valid_accuracy < best_accuracy
                ):  # No new best found. May lead to early stopping
                    current_patience -= 1

                    if current_patience <= 0:
                        print("Model has converged. Stopping training...")
                        converged = True
                        break
                else:  # new best found. Is saved.
                    new_best = True
                    best_accuracy = average_valid_accuracy
                    current_patience = args.patience
                    save_model_state(model, model_path, epoch, iteration, best_accuracy)

                # Print validation metrics unless printing is disabled.
                if not args.disable_print:

                    if not args.rl:
                        print(
                            "{}/{} Iterations: val loss: {}, val accuracy: {}".format(
                                iteration,
                                args.iterations,
                                valid_loss_meter.avg,
                                valid_acc_meter.avg,
                            )
                        )
                    else:
                        print(
                            "{}/{} Iterations: val loss: {}, val hinge loss: {}, val rl loss: {}, val entropy: {}, val accuracy: {}".format(
                                iteration,
                                args.iterations,
                                valid_loss_meter.avg,
                                hinge_loss_meter.avg,
                                rl_loss_meter.avg,
                                entropy_meter.avg,
                                valid_acc_meter.avg,
                            )
                        )

                iterations.append(iteration)
                losses.append(valid_loss_meter.avg)
                if args.rl:
                    hinge_losses.append(hinge_loss_meter.avg)
                    rl_losses.append(rl_loss_meter.avg)
                    entropies.append(entropy_meter.avg)
                accuracies.append(valid_acc_meter.avg)

            iteration += 1
            if iteration >= args.iterations:
                break

        epoch += 1

        if converged:
            break

    # prepare writing of data
    dir_path = os.path.dirname(os.path.realpath(__file__))
    dir_path = dir_path.replace("/baseline", "")
    timestamp = str(datetime.datetime.now())
    filename = "output_data/vqvae_{}_rl_{}_dc_{}_gs_{}_dln_{}_dld_{}_beta_{}_entropy_coefficient_{}_myopic_{}_mc_{}_seed_{}_{}.csv".format(
        args.vqvae,
        args.rl,
        args.discrete_communication,
        args.gumbel_softmax,
        args.discrete_latent_number,
        args.discrete_latent_dimension,
        args.beta,
        args.entropy_coefficient,
        args.myopic,
        args.myopic_coefficient,
        args.seed,
        timestamp,
    )
    full_filename = os.path.join(dir_path, filename)

    # write data
    d = [iterations, losses, hinge_losses, rl_losses, entropies, accuracies]
    export_data = zip_longest(*d, fillvalue="")
    with open(full_filename, "w", encoding="ISO-8859-1", newline="") as myfile:
        wr = csv.writer(myfile)
        wr.writerow(
            ("iteration", "loss", "hinge loss", "rl loss", "entropy", "accuracy")
        )
        wr.writerows(export_data)

    # plotting
    print(filename)
    plot_data(filename, args)

    return run_folder
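
A note on invocation: baseline takes the raw command-line arguments and runs parse_arguments on them itself, returning the run folder with the saved models and experiment parameters. A minimal sketch of a wrapper script, assuming parse_arguments accepts an argv-style list (the actual CLI entry point is not part of this example):

import sys

if __name__ == "__main__":
    # Hypothetical wrapper: forward everything after the script name to
    # baseline(), which parses the arguments internally.
    run_folder = baseline(sys.argv[1:])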
Example 2
import os
import time

import torch

# Project-specific helpers (parse_arguments, FileHelper, TrainHelper, Logger,
# get_sender_receiver, get_trainer, get_training_data, get_meta_data,
# load_model_state, save_model_state) are imported from the surrounding package,
# not shown here.


def baseline(args):
    args = parse_arguments(args)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    file_helper = FileHelper()
    train_helper = TrainHelper(device)
    train_helper.seed_torch(seed=args.seed)

    model_name = train_helper.get_filename_from_baseline_params(args)
    run_folder = file_helper.get_run_folder(args.folder, model_name)

    logger = Logger(run_folder, print_logs=(not args.disable_print))
    logger.log_args(args)

    # get sender and receiver models and save them
    sender, receiver, diagnostic_receiver = get_sender_receiver(device, args)

    sender_file = file_helper.get_sender_path(run_folder)
    receiver_file = file_helper.get_receiver_path(run_folder)
    # torch.save(sender, sender_file)

    if receiver:
        torch.save(receiver, receiver_file)

    model = get_trainer(
        sender,
        device,
        args.dataset_type,
        receiver=receiver,
        diagnostic_receiver=diagnostic_receiver,
        vqvae=args.vqvae,
        rl=args.rl,
        entropy_coefficient=args.entropy_coefficient,
        myopic=args.myopic,
        myopic_coefficient=args.myopic_coefficient,
    )

    model_path = file_helper.create_unique_model_path(model_name)

    best_accuracy = -1.0
    epoch = 0
    iteration = 0

    if args.resume_training or args.test_mode:
        epoch, iteration, best_accuracy = load_model_state(model, model_path)
        print(
            f"Loaded model. Resuming from - epoch: {epoch} | iteration: {iteration} | best accuracy: {best_accuracy}"
        )

    if not os.path.exists(file_helper.model_checkpoint_path):
        print("No checkpoint exists. Saving model...\r")
        torch.save(model.visual_module, file_helper.model_checkpoint_path)
        print("No checkpoint exists. Saving model...Done")

    train_data, valid_data, test_data, valid_meta_data, _ = get_training_data(
        device=device,
        batch_size=args.batch_size,
        k=args.k,
        debugging=args.debugging,
        dataset_type=args.dataset_type,
    )

    train_meta_data, valid_meta_data, test_meta_data = get_meta_data()

    pytorch_total_params = sum(p.numel() for p in model.parameters())

    if not args.disable_print:
        # Print info
        print("----------------------------------------")
        print("Model name: {} \n|V|: {}\nL: {}".format(model_name,
                                                       args.vocab_size,
                                                       args.max_length))
        print(sender)
        if receiver:
            print(receiver)

        if diagnostic_receiver:
            print(diagnostic_receiver)

        print("Total number of parameters: {}".format(pytorch_total_params))

    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # Train
    current_patience = args.patience
    # best_accuracy is already initialised above and restored when resuming,
    # so it is deliberately not reset here.
    converged = False

    start_time = time.time()

    if args.test_mode:
        test_loss_meter, test_acc_meter, _ = train_helper.evaluate(
            model, test_data, test_meta_data, device, args.rl)

        average_test_accuracy = test_acc_meter.avg
        average_test_loss = test_loss_meter.avg

        print(
            f"TEST results: loss: {average_test_loss} | accuracy: {average_test_accuracy}"
        )
        return

    while iteration < args.iterations:
        for train_batch in train_data:
            print(f"{iteration}/{args.iterations}       \r", end="")

            # !!! This is the complete training step; everything after it is only logging.
            _, _ = train_helper.train_one_batch(model, train_batch, optimizer,
                                                train_meta_data, device)

            if iteration % args.log_interval == 0:

                if not args.rl:
                    valid_loss_meter, valid_acc_meter, _ = train_helper.evaluate(
                        model, valid_data, valid_meta_data, device, args.rl)
                else:
                    (valid_loss_meter, hinge_loss_meter, rl_loss_meter,
                     entropy_meter, valid_acc_meter, _) = train_helper.evaluate(
                         model, valid_data, valid_meta_data, device, args.rl)

                new_best = False

                average_valid_accuracy = valid_acc_meter.avg

                # No new best found; may lead to early stopping.
                if average_valid_accuracy < best_accuracy:
                    current_patience -= 1

                    if current_patience <= 0:
                        print("Model has converged. Stopping training...")
                        converged = True
                        break
                else:  # new best found. Is saved.
                    new_best = True
                    best_accuracy = average_valid_accuracy
                    current_patience = args.patience
                    save_model_state(model, model_path, epoch, iteration,
                                     best_accuracy)

                metrics = {
                    'loss': valid_loss_meter.avg,
                    'accuracy': valid_acc_meter.avg,
                }
                if args.rl:
                    metrics['hinge loss'] = hinge_loss_meter.avg
                    metrics['rl loss'] = rl_loss_meter.avg
                    metrics['entropy'] = entropy_meter.avg

                logger.log_metrics(iteration, metrics)

            iteration += 1
            if iteration >= args.iterations:
                break

        epoch += 1

        if converged:
            break

    return run_folder
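
Compared with Example 1, this variant replaces the inline CSV writing and plotting with a Logger object (log_args, log_metrics) that receives one metrics dictionary per logged iteration. The repository's actual Logger implementation is not shown; a minimal stand-in with the same two-method interface, sketched under that assumption, could look like this:

import csv
import os
import pickle


class SimpleCSVLogger:
    """Hypothetical stand-in for the Logger used above: stores the experiment
    arguments with pickle and appends one CSV row per logged iteration."""

    def __init__(self, run_folder, print_logs=True):
        self.run_folder = run_folder
        self.print_logs = print_logs
        self.metrics_file = os.path.join(run_folder, "metrics.csv")

    def log_args(self, args):
        # Persist the parsed arguments so the run can be reproduced later.
        with open(os.path.join(self.run_folder, "experiment_params.p"), "wb") as f:
            pickle.dump(args, f)

    def log_metrics(self, iteration, metrics):
        # Write a header on first use, then append one row per call.
        new_file = not os.path.exists(self.metrics_file)
        with open(self.metrics_file, "a", newline="") as f:
            writer = csv.writer(f)
            if new_file:
                writer.writerow(["iteration"] + list(metrics))
            writer.writerow([iteration] + [metrics[key] for key in metrics])
        if self.print_logs:
            print(f"{iteration}: " + ", ".join(f"{k}: {v}" for k, v in metrics.items()))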