def test_id_wise_sampling_does_not_put_id_in_multiple_dataloaders():
    """Every trajectory ID must end up in exactly one dataloader split.

    Builds 150 equal-length trajectories, splits them ID-wise (the loader's
    default), and delegates the overlap check to the shared verifier helper.
    """
    num_ids = 150
    samples_per_id = 40
    rows = []
    sample_id = 0
    for sequence_id in range(num_ids):
        for step in range(samples_per_id):
            rows.append([step, sample_id, sequence_id])
            sample_id += 1
    trajectories = pd.DataFrame(rows, columns=["x", "y", "ID"])

    # Hyperparameters
    batch_size = 10
    num_past = 10
    num_future = 5
    train_split_ratio = 0.333
    validation_split_ratio = 0.333

    dataloaders = dataset.MultiModalDataLoader(
        trajectories,
        batch_size=batch_size,
        n_past=num_past,
        n_future=num_future,
        num_workers=1,
        train_split_ratio=train_split_ratio,
        validation_split_ratio=validation_split_ratio,
        scale=False,
    )

    verify_sequential_id_sampled_sequential_dataloaders_equal_dataloaders(
        dataloaders, train_split_ratio, validation_split_ratio, num_ids)
def test_category_wise_sampling_few_categories():
    """Category-wise splitting must also behave with only a few categories."""
    num_categories = 5
    rows = [[step, step, category]
            for category in range(num_categories)
            for step in range(40 + int(category / 14))]
    trajectories = pd.DataFrame(rows, columns=['x', 'y', 'ID'])

    # Hyperparameters
    batch_size = 1
    num_past = 10
    num_future = 5
    train_split_ratio = 0.5
    validation_split_ratio = 0.2

    dataloaders = dataset.MultiModalDataLoader(
        trajectories,
        batch_size=batch_size,
        n_past=num_past,
        n_future=num_future,
        num_workers=1,
        train_split_ratio=train_split_ratio,
        validation_split_ratio=validation_split_ratio)

    verify_category_wise_sampled_dataloaders(dataloaders, train_split_ratio,
                                             validation_split_ratio,
                                             num_categories)
def test_time_based_sampling_dataloaders_do_not_overlap():
    """With split_by_id=False the loaders must partition each trajectory by time.

    Each trajectory is built so its train segment has x == 1, its test segment
    x == 2 and its validation segment x == 3.  After the loader's scaling these
    map to -1., 0. and 1. respectively, which is what each loader is checked for.
    """
    num_ids = 140
    sequence_length = 2000

    # Hyperparameters
    batch_size = 10
    num_past = 10
    num_future = 5
    train_split_ratio = 0.501
    validation_split_ratio = 0.25
    split_by_id = False  # The test condition

    rows = []
    for sample_id in range(num_ids):
        for element in range(round(sequence_length * train_split_ratio)):
            rows.append([1, element, sample_id])
        for element in range(
                round(sequence_length *
                      (1 - train_split_ratio - validation_split_ratio))):
            rows.append([2, element, sample_id])
        for element in range(round(sequence_length * validation_split_ratio)):
            rows.append([3, element, sample_id])
    trajectories = pd.DataFrame(rows, columns=["x", "y", "ID"])

    dataloaders = dataset.MultiModalDataLoader(
        trajectories,
        batch_size=batch_size,
        n_past=num_past,
        n_future=num_future,
        num_workers=1,
        train_split_ratio=train_split_ratio,
        validation_split_ratio=validation_split_ratio,
        split_by_id=split_by_id,
    )

    def assert_first_column_equals(loader, expected):
        # Both the input window and the target window must lie in the segment.
        for batch, targets, _ids, _parameters, _classes in loader:
            for sequence in batch:
                assert all(value == expected for value in sequence[:, 0])
            for sequence in targets:
                assert all(value == expected for value in sequence[:, 0])

    assert_first_column_equals(dataloaders["train_loader"], -1.0)
    assert_first_column_equals(dataloaders["test_loader"], 0)
    assert_first_column_equals(dataloaders["validation_loader"], 1)
def test_ae():
    """Test Autoencoder and variational auto encoder models for
    training/testing/generative network and classification networks."""
    # Sample data
    df = jaguar()

    # Hyperparameters
    batch_size = 10
    num_past = 10
    num_future = 5

    # Prepare the dataloader
    data_loaders = dataset.MultiModalDataLoader(
        df,
        batch_size=batch_size,
        n_past=num_past,
        n_future=num_future,
        num_workers=1,
        train_split_ratio=0.5,
        validation_split_ratio=0.2,
    )

    model_save_path = './model.pt'

    model = MultiModelAE(
        input_size=2,
        num_past=num_past,
        batch_size=batch_size,
        num_future=num_future,
        lstm_hidden_size=32,
        num_lstm_layers=2,
        output_size=2,
        latent_size=10,
        batch_first=True,
        dropout=0.1,
        reset_state=True,
        bidirectional=False,
        num_classifier_layers=4,
        classifier_hidden_size=32,
        num_classes=9,
    )

    # Model Trainer
    # Model types; "ae" or "vae"
    trainer = HybridTrainer(model=model,
                            optimizer_type='Adam',
                            loss_type='huber')

    # Train the model in both supported training modes.
    for mode in ('forecasting', 'classification'):
        trainer.fit(data_loaders, model_save_path, epochs=10,
                    training_mode=mode)
def test_lstm_jaguar():
    """Testing method for lstm model used for forecasting.

    Trains briefly on the jaguar data and asserts the forecasting loss drops.
    """
    # Sample data
    df = jaguar()

    # Hyperparameters
    batch_size = 10
    num_past = 10
    num_future = 10

    # For timeseries prediction
    assert num_past == num_future

    # Prepare the dataloader
    data_loaders = dataset.MultiModalDataLoader(df,
                                                batch_size=batch_size,
                                                n_past=num_past,
                                                n_future=num_future,
                                                num_workers=1)

    model_save_path = "./model.pt"

    # Model init
    model = LSTM(input_size=2,
                 hidden_size=32,
                 num_layers=2,
                 output_size=2,
                 dropout=0.1,
                 batch_size=batch_size,
                 num_future=num_future,
                 bidirectional=False,
                 batch_first=True,
                 reset_state=True)

    # Model Trainer
    trainer = HybridTrainer(model=model,
                            optimizer_type='Adam',
                            loss_type='huber')

    loss_before, _, _ = trainer.validate(data_loaders['train_loader'])
    print(f'Loss pre training: {loss_before}')

    # Train the model
    trainer.fit(data_loaders,
                model_save_path,
                epochs=2,
                training_mode="forecasting",
                validate_every=1,
                test_every=2)

    loss_after, _, _ = trainer.validate(data_loaders['train_loader'])
    print(f'Loss post training: {loss_after}')

    assert loss_after < loss_before
def test_sequential_data_loader_indices_are_sequential():
    """Sequential train/test loaders must yield IDs in non-decreasing order.

    Builds 46 synthetic trajectories of slightly varying length, constructs
    the dataloaders, and verifies that the IDs emitted by the sequential
    loaders increase monotonically across batches.

    Fixes over the previous version: the loop variable no longer shadows the
    builtin ``id``, and the duplicated monotonicity loop is extracted into a
    private helper.
    """
    num_ids = 46
    rows = []
    for sample_id in range(num_ids):
        for step in range(40 + int(sample_id / 14)):
            rows.append([step, step, sample_id])
    df = pd.DataFrame(rows, columns=["x", "y", "ID"])

    # Hyperparameters
    batch_size = 18
    num_past = 13
    num_future = 8
    train_split_ratio = 0.5
    validation_split_ratio = 0.2
    stride = 1

    dataloaders = dataset.MultiModalDataLoader(
        df,
        batch_size=batch_size,
        n_past=num_past,
        n_future=num_future,
        num_workers=1,
        train_split_ratio=train_split_ratio,
        validation_split_ratio=validation_split_ratio,
        stride=stride,
    )

    def _assert_ids_monotonic(loader, loader_name):
        # Track the largest ID seen so far; any smaller later ID is a
        # violation of the sequential ordering contract.
        highest_id = 0
        for _data, _target, ids, _parameters, _classes in loader:
            for current in map(int, ids):
                if current > highest_id:
                    highest_id = current
                assert current == highest_id, (
                    f"IDs in {loader_name} should increase monotonically!")

    _assert_ids_monotonic(dataloaders["sequential_train_loader"],
                          "sequential train loader")
    _assert_ids_monotonic(dataloaders["sequential_test_loader"],
                          "sequential test loader")
def test_ae_jaguar():
    """Test autoencoder forecasting with the Jaguar dataset.

    Exercises every supported combination of periodic validation/testing in
    HybridTrainer.fit, then validates on the sequential validation loader.
    """
    # Sample data
    df = jaguar()

    # Hyperparameters
    batch_size = 10
    num_past = 10
    num_future = 5

    # Prepare the dataloader
    data_loaders = dataset.MultiModalDataLoader(
        df,
        batch_size=batch_size,
        n_past=num_past,
        n_future=num_future,
        num_workers=1,
        train_split_ratio=0.5,
        validation_split_ratio=0.2,
    )

    model_save_path = "./model.pt"

    model = MultiModelAE(
        input_size=2,
        num_past=num_past,
        batch_size=batch_size,
        num_future=num_future,
        lstm_hidden_size=32,
        num_lstm_layers=2,
        output_size=2,
        latent_size=10,
        batch_first=True,
        dropout=0.1,
        reset_state=True,
        bidirectional=False,
    )

    # Model Trainer
    # Model types; "ae" or "vae"
    trainer = HybridTrainer(model=model,
                            optimizer_type="Adam",
                            loss_type="huber")

    # Train the model under each validate_every / test_every combination.
    for validate_every, test_every in ((2, 5), (None, 5), (2, None)):
        trainer.fit(data_loaders,
                    model_save_path,
                    epochs=5,
                    training_mode="forecasting",
                    validate_every=validate_every,
                    test_every=test_every)

    trainer.validate(data_loaders["sequential_validation_loader"])
def test_plot_prediction():
    """Smoke-test plot_prediction with an untrained VAE on the jaguar data.

    Fixes over the previous version: the unused locals ``model_save_path``
    and ``trainer`` (a HybridTrainer was constructed but never used —
    ``plot_prediction`` only needs the model and a loader) are removed.
    """
    # Hyperparameters
    batch_size = 10
    num_past = 10
    num_future = 10
    input_size = 2
    lstm_hidden_size = 512
    lstm_num_layers = 4
    batch_first = True
    reset_state = True
    output_size = 2
    num_classes = 9
    latent_size = 20
    dropout = 0.1
    bidirectional = False

    # Prepare the dataloader
    df = jaguar()
    data_loaders = dataset.MultiModalDataLoader(df,
                                                batch_size=batch_size,
                                                n_past=num_past,
                                                n_future=num_future,
                                                num_workers=1)

    model = MultiModelVAE(input_size=input_size,
                          output_size=output_size,
                          lstm_hidden_size=lstm_hidden_size,
                          num_lstm_layers=lstm_num_layers,
                          num_classes=num_classes,
                          latent_size=latent_size,
                          dropout=dropout,
                          num_classifier_layers=4,
                          classifier_hidden_size=32,
                          batch_size=batch_size,
                          num_future=num_future,
                          num_past=num_past,
                          bidirectional=bidirectional,
                          batch_first=batch_first,
                          reset_state=reset_state)

    plot_prediction(model, data_loaders['sequential_test_loader'], 1)
def test_time_based_weighted_sampling_dataloaders_do_not_overlap():
    """With weighted time-based sampling, every sample index must belong to
    precisely one of the (sequential) train/test/validation loaders."""
    num_ids = 232
    rows = []
    sample_id = 0
    for sequence_id in range(num_ids):
        # Vary the trajectory lengths so the weighting is non-trivial.
        for step in range(40 + (int(sequence_id * 2.234) % 117)):
            rows.append([step, sample_id, sequence_id])
            sample_id += 1
    trajectories = pd.DataFrame(rows, columns=["x", "y", "ID"])

    # Hyperparameters
    batch_size = 10
    num_past = 10
    num_future = 5
    train_split_ratio = 0.333
    validation_split_ratio = 0.333

    dataloaders = dataset.MultiModalDataLoader(
        trajectories,
        batch_size=batch_size,
        n_past=num_past,
        n_future=num_future,
        num_workers=1,
        train_split_ratio=train_split_ratio,
        validation_split_ratio=validation_split_ratio,
        scale=False,
        split_by_id=False,
        weighted_sampling=True,
        stride=1,
    )

    def loader_ids(name):
        return extract_sample_ids_from_dataloader(dataloaders[name])

    # Extract all six ID sets in the original order before verifying, so the
    # loaders are consumed in the same sequence as before.
    train_ids = loader_ids("train_loader")
    test_ids = loader_ids("test_loader")
    validation_ids = loader_ids("validation_loader")
    sequential_train_ids = loader_ids("sequential_train_loader")
    sequential_test_ids = loader_ids("sequential_test_loader")
    sequential_validation_ids = loader_ids("sequential_validation_loader")

    verify_that_indices_belong_to_precisely_one_loader(
        train_ids, test_ids, validation_ids)
    verify_that_indices_belong_to_precisely_one_loader(
        sequential_train_ids, sequential_test_ids, sequential_validation_ids)
def test_lstm():
    """Testing method for lstm model used for forecasting."""
    # Sample data
    df = jaguar()

    # Hyperparameters
    batch_size = 10
    num_past = 10
    num_future = 10

    # For timeseries prediction
    assert num_past == num_future

    # Prepare the dataloader
    data_loaders = dataset.MultiModalDataLoader(
        df,
        batch_size=batch_size,
        n_past=num_past,
        n_future=num_future,
        num_workers=1,
    )

    model_save_path = './model.pt'

    # Model init
    model = LSTM(
        input_size=2,
        hidden_size=32,
        num_layers=2,
        output_size=2,
        dropout=0.1,
        batch_size=batch_size,
        num_future=num_future,
        bidirectional=False,
        batch_first=True,
        reset_state=True,
    )

    # Model Trainer
    trainer = HybridTrainer(model=model,
                            optimizer_type='Adam',
                            loss_type='huber')

    # Train the model
    trainer.fit(data_loaders,
                model_save_path,
                epochs=10,
                training_mode='forecasting')
def test_id_wise_sampling_with_short_sequences_does_not_divide_by_zero():
    """ID-wise splitting must tolerate trajectories too short to yield samples."""
    num_ids = 283
    rows = []
    sample_id = 0
    for sequence_id in range(num_ids):
        # Lengths cycle from 1 to 74, so some IDs generate zero time series.
        for step in range(1 + (sequence_id % 74)):
            rows.append([step, sample_id, sequence_id])
            sample_id += 1
    trajectories = pd.DataFrame(rows, columns=["x", "y", "ID"])

    # Hyperparameters
    batch_size = 1
    num_past = 10
    num_future = 5
    train_split_ratio = 0.333
    validation_split_ratio = 0.333

    dataloaders = dataset.MultiModalDataLoader(
        trajectories,
        batch_size=batch_size,
        n_past=num_past,
        n_future=num_future,
        num_workers=1,
        train_split_ratio=train_split_ratio,
        validation_split_ratio=validation_split_ratio,
        scale=False,
    )

    # Not every ID can appear: the short trajectories produce no samples.
    verify_sequential_id_sampled_sequential_dataloaders_equal_dataloaders(
        dataloaders,
        train_split_ratio,
        validation_split_ratio,
        num_ids,
        expect_all_ids=False,
    )
def test_vae_classification_network_converges():
    """Test that Autoencoder and variational auto encoder models for
    classification networks converge.

    Two easily separable classes are generated; after brief forecasting and
    classification training the classification loss must decrease.
    """
    num_ids = 8
    rows = []
    for sample_id in range(num_ids):
        sample_class = sample_id % 2  # alternate between the two classes
        for step in range(70 + sample_id * 4):
            xx = sample_class * np.sin(step / 20.0) + (sample_class - 1) * step
            yy = sample_class * np.cos(step / 20.0) + (sample_class - 1) * step
            rows.append([xx, yy, sample_id, sample_class])

    # Sample data
    df = pd.DataFrame(rows, columns=['x', 'y', 'ID', 'class'])

    # Hyperparameters
    batch_size = 2
    num_past = 10
    num_future = 5

    # Prepare the dataloader
    data_loaders = dataset.MultiModalDataLoader(df,
                                                batch_size=batch_size,
                                                n_past=num_past,
                                                n_future=num_future,
                                                train_split_ratio=0.333,
                                                validation_split_ratio=0.333,
                                                num_workers=1,
                                                split_by_id=False,
                                                stride=1)

    model_save_path = './model.pt'

    model = MultiModelVAE(input_size=2,
                          output_size=2,
                          lstm_hidden_size=32,
                          num_lstm_layers=2,
                          num_classes=2,
                          latent_size=10,
                          dropout=0.1,
                          num_classifier_layers=4,
                          classifier_hidden_size=32,
                          batch_size=batch_size,
                          num_future=num_future,
                          num_past=num_past,
                          bidirectional=False,
                          batch_first=True,
                          reset_state=True)

    # Test resetting the classifier, to make sure this function works
    model.reset_classifier(classifier_hidden_size=32, num_classifier_layers=4)

    # Model Trainer
    # Model types; "ae" or "vae"
    trainer = HybridTrainer(model=model, optimizer_type='Adam',
                            loss_type='mse')

    _, _, loss_before = trainer.validate(data_loaders['train_loader'])
    print(f'Loss pre training: {loss_before}')

    # Train the model: forecasting first, then the classification head.
    for mode in ("forecasting", "classification"):
        trainer.fit(data_loaders,
                    model_save_path,
                    epochs=2,
                    training_mode=mode,
                    validate_every=1,
                    test_every=2)

    _, _, loss_after = trainer.validate(data_loaders['train_loader'])
    print(f'Loss post training: {loss_after}')

    assert loss_after < loss_before
def test_vae_regression_network_converges():
    """Test that Autoencoder and variational auto encoder models for
    regression networks converge.

    Each trajectory ID carries two ID-dependent parameters; after brief
    forecasting and regression training the regression loss must decrease.
    """
    num_ids = 3
    rows = []
    for sample_id in range(num_ids):
        parameter_one = 0.2 * sample_id
        parameter_two = 91.235 * sample_id
        for step in range(70 + sample_id * 4):
            rows.append([step, step, sample_id, parameter_one, parameter_two])

    # Sample data
    df = pd.DataFrame(
        rows, columns=['x', 'y', 'ID', 'parameter_one', 'parameter_two'])
    parameter_columns = ['parameter_one', 'parameter_two']

    # Hyperparameters
    batch_size = 1
    num_past = 10
    num_future = 5

    # Prepare the dataloader
    data_loaders = dataset.MultiModalDataLoader(
        df,
        batch_size=batch_size,
        n_past=num_past,
        n_future=num_future,
        train_split_ratio=0.333,
        validation_split_ratio=0.333,
        num_workers=1,
        parameter_columns=parameter_columns,
        split_by_id=False,
        stride=1)

    model_save_path = './model.pt'

    model = MultiModelVAE(input_size=2,
                          output_size=2,
                          lstm_hidden_size=32,
                          num_lstm_layers=2,
                          num_regressor_parameters=len(parameter_columns),
                          latent_size=10,
                          dropout=0.1,
                          num_regressor_layers=4,
                          regressor_hidden_size=32,
                          batch_size=batch_size,
                          num_future=num_future,
                          num_past=num_past,
                          bidirectional=False,
                          batch_first=True,
                          reset_state=True)

    # Test resetting the regressor, to make sure this function works
    model.reset_regressor(regressor_hidden_size=32, num_regressor_layers=4)

    # Model Trainer
    # Model types; "ae" or "vae"
    trainer = HybridTrainer(model=model, optimizer_type="Adam",
                            loss_type="mse")

    _, loss_before, _ = trainer.validate(data_loaders['train_loader'])
    print(f'Loss pre training: {loss_before}')

    # Train the model: forecasting first, then the regression head.
    for mode in ("forecasting", "regression"):
        trainer.fit(data_loaders,
                    model_save_path,
                    epochs=2,
                    training_mode=mode,
                    validate_every=1,
                    test_every=2)

    _, loss_after, _ = trainer.validate(data_loaders['train_loader'])
    print(f'Loss post training: {loss_after}')

    assert loss_after < loss_before
def test_aevae_jaguar():
    """Test variational autoencoder forecasting with the Jaguar dataset.

    Trains briefly, then rebuilds a generator from the saved model and
    hyperparameter files and runs generative inference.
    """
    # Sample data
    df = jaguar()

    # Hyperparameters
    batch_size = 10
    num_past = 10
    num_future = 5

    # Prepare the dataloader
    data_loaders = dataset.MultiModalDataLoader(
        df,
        batch_size=batch_size,
        n_past=num_past,
        n_future=num_future,
        train_split_ratio=0.5,
        num_workers=1,
        split_by_id=False,
    )

    model_save_path = "./model.pt"

    model = MultiModelVAE(
        input_size=2,
        output_size=2,
        lstm_hidden_size=32,
        num_lstm_layers=2,
        latent_size=10,
        dropout=0.1,
        batch_size=batch_size,
        num_future=num_future,
        num_past=num_past,
        bidirectional=False,
        batch_first=True,
        reset_state=True,
    )

    # Test that we can run functions on our network.
    model.disable_latent_output()
    model.enable_latent_output()

    # Model Trainer
    # Model types; "ae" or "vae"
    trainer = HybridTrainer(model=model,
                            optimizer_type="Adam",
                            loss_type="huber")

    # Train the model
    trainer.fit(data_loaders,
                model_save_path,
                epochs=10,
                training_mode="forecasting",
                validate_every=5,
                test_every=10)

    scaler = data_loaders["train_loader"].dataset.scaler

    # Load the trained model given the path
    model_path = "./model.pt"
    hyperparams = "./hypers.json"
    model_hyperparameters = traja.models.read_hyperparameters(hyperparams)

    # For prebuilt traja generative models
    generator = traja.models.inference.Generator(
        model_type="vae",
        model_hyperparameters=model_hyperparameters,
        model_path=model_path,
        model=None,
    )
    out = generator.generate(num_future,
                             classify=False,
                             scaler=scaler,
                             plot_data=False)

    trainer.validate(data_loaders["validation_loader"])