Example #1
def load(features_path, features_format, to_tensor=True, to_cuda=True):
    loader = loaders[features_format]
    features = loader(features_path)
    if to_tensor:
        features = torch.from_numpy(features)
        if to_cuda:
            features = to_cuda_if_available(features)
    return features
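
The helpers this snippet relies on are defined elsewhere in the repository. Here is a minimal sketch of what loaders and to_cuda_if_available might look like, assuming a dense/sparse loader registry; the format names and file types are illustrative, not the repo's actual definitions:

import numpy as np
import torch
from scipy import sparse

loaders = {
    "dense": np.load,                                        # .npy file holding a dense array
    "sparse": lambda path: sparse.load_npz(path).toarray(),  # .npz file holding a CSR matrix
}

def to_cuda_if_available(*tensors):
    # Move every tensor to the GPU when one is available; otherwise return the inputs unchanged.
    # Returns a single tensor for one argument and a tuple otherwise (both call styles appear below).
    if torch.cuda.is_available():
        tensors = tuple(tensor.cuda() for tensor in tensors)
    return tensors[0] if len(tensors) == 1 else tensors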
Example #2
def train(vae,
          train_data,
          test_data,
          output_path,
          output_loss_path,
          batch_size=100,
          start_epoch=0,
          num_epochs=100,
          l2_regularization=0.001,
          learning_rate=0.001,
          variable_sizes=None,
          max_seconds_without_save=300):
    start_time = time.time()
    vae = to_cuda_if_available(vae)

    optim = Adam(vae.parameters(),
                 weight_decay=l2_regularization,
                 lr=learning_rate)

    logger = Logger(output_loss_path, append=start_epoch > 0)

    saver = Saver({vae: output_path}, logger, max_seconds_without_save)

    trainer = Trainer(vae, train_data, test_data, batch_size, optim,
                      variable_sizes)

    for epoch_index in range(start_epoch, num_epochs):
        # train vae
        logger.start_timer()
        train_losses = trainer.train()
        logger.log(epoch_index, num_epochs, "vae", "train_mean_loss",
                   np.mean(train_losses))

        # test imputation
        logger.start_timer()
        test_loss = trainer.test()
        logger.log(epoch_index, num_epochs, "vae", "test_loss", test_loss)

        # save models for the epoch
        saver.delayed_save()

    saver.save()
    logger.close()
    print("Total time: {:02f}s".format(time.time() - start_time))
Example #3
def main(args=None):
    options_parser = argparse.ArgumentParser(description="Impute missing values with an iterative VAE. "
                                                         + "Define 'temperature' to use multi-output.")

    options_parser.add_argument("data", type=str, help="See 'data_format' parameter.")

    options_parser.add_argument("metadata", type=str,
                                help="Information about the categorical variables in json format.")

    options_parser.add_argument("model", type=str, help="Model input file.")

    options_parser.add_argument("output_loss", type=str, help="Loss output file.")

    options_parser.add_argument(
        "--data_format",
        type=str,
        default="sparse",
        choices=data_formats,
        help="Either a dense numpy array, a sparse csr matrix or any of those formats in split into several files."
    )

    options_parser.add_argument(
        "--split_size",
        type=int,
        default=128,
        help="Dimension of the VAE latent space."
    )

    options_parser.add_argument(
        "--code_size",
        type=int,
        default=128,
        help="Dimension of the VAE latent space."
    )

    options_parser.add_argument(
        "--encoder_hidden_sizes",
        type=str,
        default="",
        help="Size of each hidden layer in the encoder separated by commas (no spaces)."
    )

    options_parser.add_argument(
        "--decoder_hidden_sizes",
        type=str,
        default="",
        help="Size of each hidden layer in the decoder separated by commas (no spaces)."
    )

    options_parser.add_argument(
        "--max_iterations",
        type=int,
        default=1000,
        help="Maximum number of iterations."
    )

    options_parser.add_argument(
        "--tolerance",
        type=float,
        default=0.001,
        help="Minimum RMSE to continue iterating."
    )

    options_parser.add_argument(
        "--temperature",
        type=float,
        default=None,
        help="Gumbel-Softmax temperature. Only used if metadata is also provided."
    )

    options_parser.add_argument(
        "--missing_probability",
        type=float,
        default=0.5,
        help="Probability of a value being missing."
    )

    options_parser.add_argument(
        "--noise_learning_rate",
        type=float,
        help="Learning rate to use backpropagation and modify the noise."
    )

    options_parser.add_argument("--seed", type=int, help="Random number generator seed.", default=42)

    options = options_parser.parse_args(args=args)

    seed_all(options.seed)

    variable_sizes = load_variable_sizes_from_metadata(options.metadata)

    features = load(options.data, options.data_format)

    mask = generate_mask_for(features, options.missing_probability, variable_sizes)
    mask = to_cuda_if_available(mask)

    temperature = options.temperature

    vae = VAE(
        features.shape[1],
        options.split_size,
        options.code_size,
        encoder_hidden_sizes=parse_int_list(options.encoder_hidden_sizes),
        decoder_hidden_sizes=parse_int_list(options.decoder_hidden_sizes),
        variable_sizes=(None if temperature is None else variable_sizes),  # do not use multi-output without temperature
        temperature=temperature
    )

    load_without_cuda(vae, options.model)

    impute(
        vae,
        features,
        mask,
        create_parent_directories_if_needed(options.output_loss),
        max_iterations=options.max_iterations,
        tolerance=options.tolerance,
        variable_sizes=variable_sizes,
        noise_learning_rate=options.noise_learning_rate
    )
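
parse_int_list turns the comma-separated --encoder_hidden_sizes and --decoder_hidden_sizes strings into lists of layer sizes. A plausible sketch, assuming an empty string means "no hidden layers":

def parse_int_list(comma_separated_ints):
    # "256,128" -> [256, 128]; "" -> [] (assumed convention).
    if not comma_separated_ints:
        return []
    return [int(size) for size in comma_separated_ints.split(",")]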
Example #4
def impute(vae,
           features,
           mask,
           output_loss_path,
           max_iterations=1000,
           tolerance=1e-3,
           variable_sizes=None,
           noise_learning_rate=None
           ):
    start_time = time.time()
    vae = to_cuda_if_available(vae)

    logger = Logger(output_loss_path, append=False)

    loss_function = MSELoss()

    inverted_mask = 1 - mask

    observed = features * mask
    missing = torch.randn_like(features)

    if noise_learning_rate is not None:
        missing = Variable(missing, requires_grad=True)
        optim = Adam([missing], weight_decay=0, lr=noise_learning_rate)

    vae.train(mode=True)

    for iteration in range(max_iterations):
        logger.start_timer()

        if noise_learning_rate is not None:
            optim.zero_grad()

        # Current estimate: observed values where known, the running guess elsewhere.
        noisy_features = observed + missing * inverted_mask
        _, reconstructed, _, _ = vae(noisy_features, training=True)

        observed_loss = masked_reconstruction_loss_function(reconstructed,
                                                            features,
                                                            mask,
                                                            variable_sizes)

        missing_loss = masked_reconstruction_loss_function(reconstructed,
                                                           features,
                                                           inverted_mask,
                                                           variable_sizes)

        loss = torch.sqrt(loss_function(compose_with_mask(features, reconstructed, mask), features))

        if noise_learning_rate is None:
            # Iterative mode: feed the reconstruction back as the next guess for the missing values.
            missing = reconstructed * inverted_mask
        else:
            # Backpropagation mode: update the noise tensor itself to reduce the observed loss.
            observed_loss.backward()
            optim.step()

        observed_loss, missing_loss, loss = to_cpu_if_available(observed_loss, missing_loss, loss)
        observed_loss = observed_loss.data.numpy()
        missing_loss = missing_loss.data.numpy()
        loss = loss.data.numpy()

        logger.log(iteration, max_iterations, "vae", "observed_loss", observed_loss)
        logger.log(iteration, max_iterations, "vae", "missing_loss", missing_loss)
        logger.log(iteration, max_iterations, "vae", "loss", loss)

        if observed_loss < tolerance:
            break

    logger.close()
    print("Total time: {:02f}s".format(time.time() - start_time))
Example #5
def create_noisy_dataset(features, missing_probability, variable_sizes, return_all=False):
    mask = generate_mask_for(features, missing_probability, variable_sizes)
    mask = to_cuda_if_available(mask)
    return NoisyDataset(features, mask, return_all=return_all)
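
generate_mask_for appears throughout these examples but is not shown. A sketch under two assumptions: a 1 marks an observed value, and variable_sizes groups the one-hot columns of each categorical variable so that a whole variable is dropped at once:

import torch

def generate_mask_for(features, missing_probability, variable_sizes=None):
    if variable_sizes is None:
        # Independent Bernoulli decision per entry.
        return (torch.rand(features.shape) >= missing_probability).float()
    # One decision per variable, repeated across that variable's one-hot columns.
    num_samples = features.shape[0]
    columns = []
    for variable_size in variable_sizes:
        kept = (torch.rand(num_samples, 1) >= missing_probability).float()
        columns.append(kept.expand(num_samples, variable_size))
    return torch.cat(columns, dim=1)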
Example #6
def smooth_label_ones_like(batch):
    smooth_label_ones = Variable(torch.FloatTensor(len(batch)).uniform_(0.9, 1))
    return to_cuda_if_available(smooth_label_ones)
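
Drawing the "real" targets uniformly from [0.9, 1) instead of using hard ones is one-sided label smoothing, a standard trick to keep the discriminator from becoming overconfident.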
Example #7
def label_zeros_like(batch):
    label_zeros = Variable(torch.zeros(len(batch)))
    return to_cuda_if_available(label_zeros)
Example #8
def generate_noise(num_samples, num_features):
    noise = Variable(torch.FloatTensor(num_samples, num_features).normal_())
    return to_cuda_if_available(noise)
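
A quick usage example (the batch and feature sizes are illustrative):

noise = generate_noise(64, 128)  # 64 samples, each with 128 standard-normal features
print(noise.shape)               # torch.Size([64, 128])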
Example #9
def generate_hint_for(mask, hint_probability, variable_sizes):
    return mask * to_cuda_if_available(
        generate_mask_for(mask, 1.0 - hint_probability, variable_sizes))
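
Assuming generate_mask_for zeroes each entry with the probability it is given (see the sketch above), the inner call yields a matrix whose entries are 1 with probability hint_probability. Multiplying by mask then reveals each genuinely observed position to the discriminator with probability hint_probability and never flags a missing one, matching the hint mechanism of GAIN-style imputation.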
Example #10
def train(
    generator,
    discriminator,
    train_data,
    test_data,
    output_gen_path,
    output_disc_path,
    output_loss_path,
    batch_size=64,
    start_epoch=0,
    num_epochs=10000,
    num_disc_steps=1,
    num_gen_steps=1,
    l2_regularization=0,
    learning_rate=0.001,
    variable_sizes=None,
    reconstruction_loss_weight=1,
    hint_probability=0.9,
    max_seconds_without_save=300,
    early_stopping_patience=100,
):
    start_time = time.time()
    generator, discriminator = to_cuda_if_available(generator, discriminator)

    optim_gen = Adam(generator.parameters(),
                     weight_decay=l2_regularization,
                     lr=learning_rate)
    optim_disc = Adam(discriminator.parameters(),
                      weight_decay=l2_regularization,
                      lr=learning_rate)

    logger = Logger(output_loss_path, append=start_epoch > 0)

    saver = Saver({
        generator: output_gen_path,
        discriminator: output_disc_path
    }, logger, max_seconds_without_save)

    trainer = Trainer(train_data, test_data, generator, discriminator,
                      optim_gen, optim_disc, batch_size, variable_sizes,
                      num_disc_steps, num_gen_steps,
                      reconstruction_loss_weight, hint_probability)

    # initialize early stopping
    best_test_mean_loss = None
    bad_epochs = 0

    for epoch_index in range(start_epoch, num_epochs):
        # train discriminator and generator
        logger.start_timer()
        disc_losses, gen_losses = trainer.train()
        logger.log(epoch_index, num_epochs, "discriminator", "train_mean_loss",
                   np.mean(disc_losses))
        logger.log(epoch_index, num_epochs, "generator", "train_mean_loss",
                   np.mean(gen_losses))

        # test imputation
        logger.start_timer()
        reconstruction_losses = trainer.test()
        test_mean_loss = np.mean(reconstruction_losses)
        logger.log(epoch_index, num_epochs, "generator", "test_mean_loss",
                   test_mean_loss)

        # check if the test loss is improving
        if best_test_mean_loss is None or test_mean_loss < best_test_mean_loss:
            best_test_mean_loss = test_mean_loss
            bad_epochs = 0

            # save models for the epoch
            saver.delayed_save(keep_parameters=True)

        # if the test loss is not improving, check whether early stopping should trigger
        else:
            bad_epochs += 1
            if bad_epochs >= early_stopping_patience:
                break

    saver.save(only_use_kept=True)
    logger.close()
    print("Total time: {:02f}s".format(time.time() - start_time))