def trainAndTest(dataset, enable_data_augmentation=False, percentage_similarity_loss=0,
                 LSTM=False, EPOCHS=500, enable_same_noise=False, save_output=True, NlogN=True):
    X_train, y_train, X_test, y_test, info = py_ts_data.load_data(dataset, variables_as_channels=True)
    print("Dataset shape: Train: {}, Test: {}".format(X_train.shape, X_test.shape))
    print(np.shape(y_train))

    if enable_data_augmentation or len(X_train) >= 1000:
        # LSTM greatly extends the training time, so disable it when the training set is large
        LSTM = False

    title = "{}-DA:{}-CoefSimilar:{}-LSTM:{}".format(dataset, enable_data_augmentation,
                                                     percentage_similarity_loss, LSTM)

    ##### Preprocess Data ####
    num_train = len(X_train)
    if num_train < 1000 and enable_data_augmentation:
        X_train = augment_data(X_train, enable_same_noise=enable_same_noise)
        num_train = len(X_train)

    # Randomly generate training pairs. With NlogN enabled this would be
    # int(num_train * math.log2(num_train)) pairs, which grows quickly:
    # for N = 1000, N*log2(N) is roughly 10K pairs.
    if NlogN:
        num_of_pairs = num_train * int(math.log2(num_train))
    else:
        num_of_pairs = num_train
    X, Y = generateRandomPairs(num_of_pairs, X_train)
    normalized_X, normalized_Y, distance = calculatePreSBD(X, Y)

    ###### Training Stage #####
    kwargs = {
        "input_shape": (X_train.shape[1], X_train.shape[2]),
        "filters": [32, 64, 128],
        "kernel_sizes": [5, 5, 5],
        "code_size": 16,
    }
    ae = AutoEncoder(**kwargs)

    # Training
    loss_history = []
    t1 = time.time()
    for epoch in range(EPOCHS):
        if epoch % 100 == 50:
            print("Epoch {}/{}".format(epoch, EPOCHS))
        total_loss = train_step(normalized_X, normalized_Y, distance, ae,
                                alpha=percentage_similarity_loss, LSTM=LSTM)
        loss_history.append(total_loss)
        # print("Epoch {}: {}".format(epoch, total_loss), end="\r")
    print("The training time for dataset {} is: {} minutes".format(dataset, (time.time() - t1) / 60))

    #%%
    plt.clf()
    plt.xlabel("epoch starting from 5")
    plt.ylabel("loss")
    plt.title("Loss vs epoch")
    plt.plot(loss_history[5:])
    # plt.show()
    if save_output:
        if not os.path.isdir(ouput_dir_name + dataset):
            os.mkdir(ouput_dir_name + dataset)
            with open(ouput_dir_name + dataset + "/record.txt", "a") as f:
                f.write("Dataset, Data Augmentation, Coefficient of Similarity Loss, LSTM, EPOCHS, "
                        "Distance Measure, L2 Distance, 10-nn score, NlogN\n")
        plt.savefig(ouput_dir_name + dataset + "/" + title + "-loss.png")

    #%%
    X_test = normalize(X_test)
    code_test = ae.encode(X_test, LSTM=LSTM)
    decoded_test = ae.decode(code_test)
    plt.clf()
    plt.plot(X_test[0], label="Original TS")
    plt.plot(decoded_test[0], label="Reconstructed TS")
    plt.legend()
    if save_output:
        plt.savefig(ouput_dir_name + dataset + "/" + title + "-reconstruction.png")
    # plt.show()

    losses = []
    for ground, predict in zip(X_test, decoded_test):
        losses.append(np.linalg.norm(ground - predict))
    L2_distance = np.array(losses).mean()
    print("Mean L2 distance: {}".format(L2_distance))

    #%%
    from sklearn.neighbors import NearestNeighbors
    nn_x_test = np.squeeze(X_test)
    baseline_nn = NearestNeighbors(n_neighbors=10, metric=SBD).fit(nn_x_test)
    code_nn = NearestNeighbors(n_neighbors=10).fit(code_test)  # the default metric is euclidean distance
    # For each item in the test data, find its 11 nearest neighbors in that dataset (the first nn is the item itself)
    baseline_11nn = baseline_nn.kneighbors(nn_x_test, 11, return_distance=False)
    code_11nn = code_nn.kneighbors(code_test, 11, return_distance=False)

    # On average, how many common items are in the 10-nn sets?
    result = []
    for b, c in zip(baseline_11nn, code_11nn):
        # remove the first nn (the item itself)
        b = set(b[1:])
        c = set(c[1:])
        result.append(len(b.intersection(c)))
    ten_nn_score = np.array(result).mean()
    print("10-nn score is:", ten_nn_score)

    if save_output:
        with open(ouput_dir_name + dataset + "/record.txt", "a") as f:
            f.write(",".join([dataset, str(enable_data_augmentation), str(percentage_similarity_loss),
                              str(LSTM), str(EPOCHS), distance_measure, str(round(L2_distance, 2)),
                              str(round(ten_nn_score, 2)), str(NlogN)]) + "\n")
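# Illustrative only (not part of the original script): a minimal sketch of how trainAndTest
# might be driven over a few datasets. The dataset names, keyword values, and the helper
# itself are assumptions chosen to exercise the flags defined above; the helper is never called.
def _example_trainAndTest_runs():
    for ds in ["GunPoint", "Coffee"]:  # hypothetical UCR dataset names
        trainAndTest(
            ds,
            enable_data_augmentation=True,    # augment when the training set is small
            percentage_similarity_loss=0.5,   # weight of the similarity term in the loss
            EPOCHS=500,
            NlogN=True,                       # sample N*log2(N) random pairs
        )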
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--auto", action="store_true", help="autoencoder")
    parser.add_argument("-e", "--encauto", action="store_true", help="encoder + autoencoder")
    parser.add_argument("-s", "--seqencauto", action="store_true", help="Encoder(sim) + autoencoder(rec)")
    parser.add_argument("l", help="lambda (weight of the similarity loss)")
    parser.add_argument("filter1", help="conv filters in layer 1")
    parser.add_argument("filter2", help="conv filters in layer 2")
    parser.add_argument("filter3", help="conv filters in layer 3")
    parser.add_argument("epoch", help="number of training epochs")
    parser.add_argument("batch", help="batch size")
    args = parser.parse_args()

    m_type = None
    if args.auto:
        m_type = "autoencoder"
    elif args.encauto:
        m_type = "encoder_autoencoder"
    elif args.seqencauto:
        m_type = "Encoder_sim_autoencoder_rec"
    else:
        raise Exception("model type flag not set")

    model_type_log = "{m_type} lambda={l} filter=[{filter1}, {filter2}, {filter3}] epoch={epoch} batch={batch}".format(
        m_type=m_type, l=args.l, filter1=args.filter1, filter2=args.filter2, filter3=args.filter3,
        epoch=args.epoch, batch=args.batch)
    filters = [int(args.filter1), int(args.filter2), int(args.filter3)]
    BATCH = int(args.batch)
    EPOCHS = int(args.epoch)
    lam = float(args.l)

    hyperparams["model_type"] = model_type_log
    hyperparams["epochs"] = EPOCHS
    hyperparams["batch_size"] = BATCH
    experiment = Experiment(log_code=False)
    experiment.log_parameters(LAMBDA)
    experiment.log_parameters(hyperparams)

    dataset_name = "GunPoint"
    X_train, y_train, X_test, y_test, info = py_ts_data.load_data(
        dataset_name, variables_as_channels=True)
    print("Dataset shape: Train: {}, Test: {}".format(X_train.shape, X_test.shape))
    print(X_train.shape, y_train.shape)
    X_train, y_train = augmentation(X_train, y_train)
    # X_test, y_test = augmentation(X_test, y_test)
    print(X_train.shape, y_train.shape)

    # fig, axs = plt.subplots(1, 2, figsize=(10, 3))
    # axs[0].plot(X_train[200])
    X_train = min_max(X_train, feature_range=(-1, 1))
    # axs[1].plot(X_train[200])
    X_test = min_max(X_test, feature_range=(-1, 1))
    # plt.show()

    kwargs = {
        "input_shape": (X_train.shape[1], X_train.shape[2]),
        # "filters": [32, 64, 128],
        # "filters": [128, 64, 32],
        "filters": filters,
        # "filters": [32, 32, 32],
        # "filters": [32, 32, 16],
        "kernel_sizes": [5, 5, 5],
        "code_size": 16,
    }

    # lambda_to_test = [0.9, ]
    # for l in range(1, 10):
    #     lam = l / 10
    # lam = 0.99
    ae = AutoEncoder(**kwargs)
    input_shape = kwargs["input_shape"]
    code_size = kwargs["code_size"]
    filters = kwargs["filters"]
    kernel_sizes = kwargs["kernel_sizes"]
    encoder = Encoder(input_shape, code_size, filters, kernel_sizes)

    # training
    SHUFFLE_BUFFER = 100
    K = len(set(y_train))
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER).batch(BATCH)

    suffix = "lam={lam}".format(lam=lam)
    train(ae, encoder, EPOCHS, train_dataset, suffix, experiment, lam, args)
    code_test = recon_eval(ae, X_test, suffix, experiment)
    sim_eval(X_test, code_test, suffix, experiment)

    cwd = os.path.abspath(os.getcwd())
    metadata = "lambda_{l}_filter_{filter1}{filter2}{filter3}_epoch_{epoch}_batch_{batch}".format(
        l=args.l, filter1=args.filter1, filter2=args.filter2, filter3=args.filter3,
        epoch=args.epoch, batch=args.batch)
    encoder_path = os.path.join(cwd, m_type, dataset_name, metadata, "encoder")
    ae_encoder_path = os.path.join(cwd, m_type, dataset_name, metadata, "auto_encoder")
    ae_decoder_path = os.path.join(cwd, m_type, dataset_name, metadata, "decoder")
    if not args.auto:
        encoder.save(encoder_path)
    ae.encode.save(ae_encoder_path)
    ae.decode.save(ae_decoder_path)

    sample_evaluation(ae.encode, ae.encode, ae.decode, experiment, suffix,
                      DATA=dataset_name)
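# Example invocations (the script name and the concrete values are hypothetical; the flags and
# positional order match the parser defined in main above: l filter1 filter2 filter3 epoch batch):
#   python train_main.py -a 0.5 32 64 128 500 16    # plain autoencoder, lambda=0.5
#   python train_main.py -e 0.99 64 32 16 50 50     # encoder + autoencoder, lambda=0.99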
help="dataset to run") PARSER.add_argument('-m', '--models', default="sample_model", required=False, help="dataset to run") ARGS = PARSER.parse_args() DATA = ARGS.dataset MODELS_PATH = ARGS.models ENCODER = tf.keras.models.load_model(os.path.join(MODELS_PATH, DATA, "encoder")) DECODER = tf.keras.models.load_model(os.path.join(MODELS_PATH, DATA, "decoder")) X_TRAIN, Y_TRAIN, X_TEST, Y_TEST, _ = py_ts_data.load_data( DATA, variables_as_channels=True) # all are read in with 3 dims, last is num of variables in the TS assert len(X_TRAIN.shape) == 3 # we care only about univariate TS assert X_TRAIN.shape[2] == 1 X_TRAIN = np.squeeze(X_TRAIN, axis=2) X_TEST = np.squeeze(X_TEST, axis=2) N_NEIGHBORS = 10 N_CLUSTERS = len(set(Y_TRAIN)) CLUSTERING = KMeans(N_CLUSTERS).fit(X_TRAIN) def encoder(x): assert len(x.shape) == 2 x = x[..., np.newaxis]
def main():
    experiment = Experiment(log_code=False)
    experiment.log_parameters(LAMBDA)
    experiment.log_parameters(hyperparams)

    dataset_name = "GunPoint"
    X_train, y_train, X_test, y_test, info = py_ts_data.load_data(
        dataset_name, variables_as_channels=True)
    print("Dataset shape: Train: {}, Test: {}".format(X_train.shape, X_test.shape))
    print(X_train.shape, y_train.shape)
    X_train, y_train = augmentation(X_train, y_train)
    # X_test, y_test = augmentation(X_test, y_test)
    print(X_train.shape, y_train.shape)

    # fig, axs = plt.subplots(1, 2, figsize=(10, 3))
    # axs[0].plot(X_train[200])
    X_train = min_max(X_train, feature_range=(-1, 1))
    # axs[1].plot(X_train[200])
    X_test = min_max(X_test, feature_range=(-1, 1))
    # plt.show()

    kwargs = {
        "input_shape": (X_train.shape[1], X_train.shape[2]),
        # "filters": [32, 64, 128],
        # "filters": [128, 64, 32],
        "filters": [64, 32, 16],
        # "filters": [32, 32, 32],
        # "filters": [32, 32, 16],
        "kernel_sizes": [5, 5, 5],
        "code_size": 16,
    }

    # lambda_to_test = [0.9, ]
    # for l in range(1, 10):
    #     lam = l / 10
    lam = 0.99
    ae = AutoEncoder(**kwargs)
    input_shape = kwargs["input_shape"]
    code_size = kwargs["code_size"]
    filters = kwargs["filters"]
    kernel_sizes = kwargs["kernel_sizes"]
    encoder = Encoder(input_shape, code_size, filters, kernel_sizes)

    # training
    SHUFFLE_BUFFER = 100
    K = len(set(y_train))
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER).batch(BATCH)

    suffix = "lam={lam}".format(lam=lam)
    train(ae, encoder, EPOCHS, train_dataset, suffix, experiment, lam)
    code_test = recon_eval(ae, X_test, suffix, experiment)
    sim_eval(X_test, code_test, suffix, experiment)

    encoder.save(
        r"C:\Users\jiang\Desktop\2270\cs227_final_project\enc_auto_643216_50_50\GunPoint\encoder"
    )
    ae.encode.save(
        r"C:\Users\jiang\Desktop\2270\cs227_final_project\enc_auto_643216_50_50\GunPoint\auto_encoder"
    )
    ae.decode.save(
        r"C:\Users\jiang\Desktop\2270\cs227_final_project\enc_auto_643216_50_50\GunPoint\decoder"
    )

    sample_evaluation(ae.encode, ae.encode, ae.decode, experiment, suffix,
                      DATA=dataset_name)
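# Illustrative only (not part of the original script): the encoder/decoder saved in main()
# can later be reloaded with tf.keras.models.load_model, the same way the evaluation script
# above does. The base path mirrors the save calls in main() and is an assumption about
# where the artifacts end up; the helper itself is hypothetical and never called.
def _example_reload_models():
    base = r"C:\Users\jiang\Desktop\2270\cs227_final_project\enc_auto_643216_50_50\GunPoint"
    enc = tf.keras.models.load_model(os.path.join(base, "encoder"))
    dec = tf.keras.models.load_model(os.path.join(base, "decoder"))
    return enc, dec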