def main():
    experiment_name = f'ss_target_{now()}'
    cv_path = Path(f"result/{experiment_name}")
    cv_path.mkdir(parents=True)
    copy_script(cv_path)
    log = Logger(experiment_name, cv_path / "exp.log")

    log.info("load data")
    with log.interval_timer("load data"):
        train = load_fs('nejumi_ss', conf)
        train_y = train.SmartScreen
        train_X = train.loc[:, train.columns != 'SmartScreen']
        test = load_fs('nejumi_ss', conf, test=True)
        test_y = test.SmartScreen
        test_X = test.loc[:, test.columns != 'SmartScreen']
        # SmartScreen is a pseudo-target, so train and test rows can be pooled.
        train_X = pd.concat([train_X, test_X])
        train_y = pd.concat([train_y, test_y])
    log.info(pformat(list(train_X.columns)))

    # random_state only takes effect when shuffle=True.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=conf.seed)
    cv = cv.split(train_X, train_y)

    log.info("learning start")
    log.double_kiritori()
    with open('features/NN/conf_nejumi_ss.pkl', 'rb') as p:
        embedd_conf = pickle.load(p)
    # Drop the target column from the embedding configuration.
    new_emb_conf = list()
    new_emb_conf.append([c for c in embedd_conf[0] if c[0] != 'SmartScreen'])
    num_dict = dict()
    cols, input_size, out_size = embedd_conf[1]['cont']
    cols = [c for c in cols if 'SmartScreen' not in c]
    input_size = len(cols)
    num_dict['cont'] = cols, input_size, out_size
    new_emb_conf.append(num_dict)
    log.info(pformat(new_emb_conf))

    meta = NN_cv(train_X, train_y, cv, log, cv_path, split_conf=new_emb_conf)
    log.double_kiritori()
    log.info("done")
    del train_X, train_y
    np.save(cv_path / "oof_preds.npy", meta)
def main():
    experiment_name = now()
    cv_path = Path(f"result/{experiment_name}")
    cv_path.mkdir(parents=True)
    copy_script(cv_path)
    log = Logger(experiment_name, cv_path / "exp.log")

    log.info("load data")
    with log.interval_timer("load data"):
        train_X = load_fs_tosh('all_snap', conf)
        train_y = feather.read_dataframe("features/HasDetections.ftr")
        train_y = train_y.HasDetections
        test = load_fs_tosh('all_snap', conf, test=True)
    log.info(pformat(list(train_X.columns)))

    # random_state only takes effect when shuffle=True.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=conf.seed)
    cv = cv.split(train_X, train_y)

    log.info("learning start")
    log.double_kiritori()
    with open('features/NN/conf_tosh_all_snap.pkl', 'rb') as p:
        embedd_conf = pickle.load(p)
    log.info(pformat(embedd_conf))

    score, pred, meta = NN_cv(train_X, train_y, cv, log, cv_path,
                              X_test=test, split_conf=embedd_conf)
    log.info(score)
    log.double_kiritori()
    log.info("done")
    del train_X, train_y
    np.save(cv_path / "test_preds.npy", pred)
    np.save(cv_path / "oof_preds.npy", meta)
    make_submission(pred, f"submissions/{experiment_name}.csv.gz")
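# Illustrative sketch (not part of the pipeline): both drivers above hand NN_cv a
# generator of (train_idx, valid_idx) pairs rather than the splitter object itself,
# so the folds can only be consumed once. A minimal self-contained version of the
# pattern with plain sklearn/numpy; all names here are hypothetical stand-ins.
def _sketch_fold_indices():
    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    X = np.random.randn(1000, 8)             # stand-in for train_X
    y = np.random.randint(0, 2, size=1000)   # stand-in for train_y

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    folds = skf.split(X, y)  # generator of (train_idx, valid_idx) index arrays

    oof = np.zeros(len(y))
    for train_idx, valid_idx in folds:
        # A real run would fit on X[train_idx] and predict X[valid_idx];
        # a dummy constant prediction keeps the sketch self-contained.
        oof[valid_idx] = y[train_idx].mean()
    return oof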
def run_cvae():
    seed = np.random.randint(1, 2147462579)

    # def sinus_seq(period, samples, length):
    #     X = np.linspace(-np.pi*(samples/period), np.pi*(samples/period), samples)
    #     X = np.reshape(np.sin(X), (-1, length, 1))
    #     X += np.random.randn(*X.shape)*0.1
    #     X = (X - np.min(X))/(np.max(X) - np.min(X))
    #     return X, np.ones((samples/length, 1))
    #
    # X1, y1 = sinus_seq(40, 100000, 50)
    # X2, y2 = sinus_seq(20, 40000, 50)
    #
    # X = np.concatenate((X1, X2)).astype('float32')
    # y = np.concatenate((y1*0, y2*1), axis=0).astype('int')
    #
    # dim_samples, dim_sequence, dim_features = X.shape
    # X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

    # X, y, users, stats = har.load()
    n_samples, step = 25, 25
    load_data = LoadHAR(add_pitch=False, add_roll=False, add_filter=False,
                        n_samples=n_samples, diff=False, step=step,
                        normalize='segments', comp_magnitude=True,
                        simple_labels=True, common_labels=True)
    X, y, name, users, stats = load_data.uci_hapt()
    limited_labels = y < 5
    y = y[limited_labels]
    X = X[limited_labels].astype(np.float32)
    users = users[limited_labels]
    X -= X.mean(axis=0)

    # Compress labels onto a contiguous 0..K-1 range.
    for idx, label in enumerate(np.unique(y)):
        if not np.equal(idx, label):
            y[y == label] = idx

    y_unique = np.unique(y)
    y = one_hot(y, len(y_unique))

    dim_samples, dim_sequence, dim_features = X.shape
    num_classes = len(y_unique)

    # Split into train and test stratified by users.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=users)

    # Combine in sets.
    train_set = (X_train, y_train)
    test_set = (X_test, y_test)
    print('Train size: ', train_set[0].shape)
    print('Test size: ', test_set[0].shape)

    n, seq, n_x = train_set[0].shape  # Datapoints in the dataset, input features.
    n_batches = n // 100              # The number of batches.
    bs = n // n_batches               # The batch size.

    # Initialize the convolutional VAE.
    # filters: [num_filters, stride, pool] per layer.
    filters = [[128, 1, 2], [128, 1, 2], [128, 1, 2], [128, 1, 2]]
    model = CVAE(n_x=int(n_x), n_z=128, px_hid=[128], qz_hid=[128],
                 filters=filters, seq_length=int(seq), nonlinearity=rectify,
                 batchnorm=False, x_dist='gaussian')

    # Copy script to output folder.
    copy_script(__file__, model)

    # Get the training functions.
    f_train, f_test, f_validate, train_args, test_args, validate_args = model.build_model(
        train_set, test_set)

    # Update the default function arguments.
    train_args['inputs']['batchsize'] = 100
    train_args['inputs']['learningrate'] = 1e-4
    train_args['inputs']['beta1'] = 0.9
    train_args['inputs']['beta2'] = 0.999
    train_args['inputs']['warmup'] = .5

    def custom_evaluation(model, path):
        plt.clf()
        f, axarr = plt.subplots(nrows=len(np.unique(y)), ncols=2)
        z_ = np.empty((0, model.n_z))
        y_ = np.empty((0,))
        for idx, y_l in enumerate(np.unique(y)):
            act_idx = test_set[1] == y_l
            test_act = test_set[0][act_idx[:, 0]]

            z = model.f_qz(test_act, 1)
            z_ = np.concatenate((z_, z))
            y_ = np.concatenate((y_, np.ones((len(test_act),)) * y_l))

            xhat = model.f_px(z, 1)
            mu = model.f_mu(z, 1)
            var = np.exp(model.f_var(z, 1))

            # Original in red, reconstruction dotted blue; mu/var in the second column.
            axarr[idx, 0].plot(test_act[:2].reshape(-1, dim_features), color='red')
            axarr[idx, 0].plot(xhat[:2].reshape(-1, dim_features), color='blue', linestyle='dotted')
            axarr[idx, 1].plot(mu[:2].reshape(-1, dim_features), label="mu")
            axarr[idx, 1].plot(var[:2].reshape(-1, dim_features), label="var")
        plt.legend()
        f.set_size_inches(12, 10)
        f.savefig(path, dpi=100, format='png')
        plt.close(f)

        # Plot PCA decomposition of Z.
        z_pca = PCA(n_components=2).fit_transform(z_)
        plt.clf()
        plt.figure()
        for c, i in zip(['r', 'b'], set(y_unique)):
            plt.scatter(z_pca[y_ == i, 0], z_pca[y_ == i, 1], c=c, alpha=0.8)
        plt.legend()
        plt.title('PCA of Z')
        plt.savefig(path.replace('custom_eval_plot', 'pca/z'))
        plt.close()

    # Define training loop. Output training evaluations every 1 epoch
    # and the custom evaluation method every 10 epochs.
    train = TrainModel(model=model, output_freq=1,
                       pickle_f_custom_freq=10, f_custom_eval=custom_evaluation)
    train.add_initial_training_notes("Training the cvae with bn %s. seed %i." %
                                     (str(model.batchnorm), seed))
    train.train_model(f_train, train_args,
                      f_test, test_args,
                      f_validate, validate_args,
                      n_train_batches=n_batches,
                      n_epochs=1000,
                      anneal=[("learningrate", 100, 0.75, 3e-5),
                              ("warmup", 5, 0.99, 0.1)])
    image_to_movie.create(model.get_root_path() + '/training_custom_evals/', rate=3)
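# Illustrative sketch (not part of the pipeline): the label-compression loop used in
# run_cvae and the other drivers remaps a gappy set of integer labels onto a
# contiguous 0..K-1 range before one-hot encoding. _one_hot_sketch below is a
# hypothetical numpy equivalent of the project's one_hot helper, not its actual code.
def _sketch_label_compression():
    import numpy as np

    y = np.array([0, 2, 2, 7, 7, 0])  # labels with gaps (no 1, 3..6)

    # Remap to a contiguous range: 0 -> 0, 2 -> 1, 7 -> 2. Safe because
    # np.unique is sorted, so each target index is <= the label it replaces.
    for idx, label in enumerate(np.unique(y)):
        if idx != label:
            y[y == label] = idx

    def _one_hot_sketch(labels, n_classes):
        out = np.zeros((len(labels), n_classes), dtype=np.float32)
        out[np.arange(len(labels)), labels] = 1.0
        return out

    return _one_hot_sketch(y, len(np.unique(y)))  # shape (6, 3)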
def main():
    seed = np.random.randint(1, 2147462579)

    # def sinus_seq(period, samples, length):
    #     X = np.linspace(-np.pi*(samples/period), np.pi*(samples/period), samples)
    #     X = np.reshape(np.sin(X), (-1, length, 1))
    #     X += np.random.randn(*X.shape)*0.1
    #     X = (X - np.min(X))/(np.max(X) - np.min(X))
    #     return X, np.ones((samples/length, 1))
    #
    # X1, y1 = sinus_seq(20, 100000, 40)
    # X2, y2 = sinus_seq(12, 100000, 40)
    # X3, y3 = sinus_seq(8, 100000, 40)
    #
    # X = np.concatenate((X1, X2, X3)).astype('float32')
    # y = np.concatenate((y1*0, y2*1, y3*2), axis=0).astype('int')[:, 0]
    #
    # y_unique = np.unique(y)
    # y = one_hot(y, len(y_unique))
    # num_classes = len(y_unique)
    #
    # X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9)

    # X, y, users, stats = har.load()
    n_samples, step = 50, 50
    load_data = LoadHAR(add_pitch=False, add_roll=False, add_filter=False,
                        n_samples=n_samples, diff=False, step=step,
                        normalize='segments', comp_magnitude=False,
                        simple_labels=False, common_labels=False)
    X, y, name, users, stats = load_data.uci_hapt()
    limited_labels = y < 18
    y = y[limited_labels]
    X = X[limited_labels].astype(np.float32)
    users = users[limited_labels]
    # X -= X.mean(axis=0)

    # Compress labels onto a contiguous 0..K-1 range.
    for idx, label in enumerate(np.unique(y)):
        if not np.equal(idx, label):
            y[y == label] = idx

    y_unique = np.unique(y)
    num_classes = len(y_unique)
    y = one_hot(y, num_classes)

    # Split into train and test stratified by users.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000, stratify=users)

    # Split training into labelled and unlabelled, stratified by the label.
    n_labeled = 1001  # size of the labelled pool
    X_train_labeled, X_train_unlabeled, y_train_labeled, y_train_unlabeled = \
        train_test_split(X_train, y_train, train_size=n_labeled,
                         stratify=np.argmax(y_train, axis=1))

    # Combine in sets.
    train_set_labeled = (X_train_labeled, y_train_labeled)
    train_set_unlabeled = (X_train_unlabeled, y_train_unlabeled)
    test_set = (X_test, y_test)
    print('Train unlabelled size: ', train_set_unlabeled[0].shape)
    print('Train labelled size: ', train_set_labeled[0].shape)
    print('Test size: ', test_set[0].shape)

    n, n_l, n_c = train_set_unlabeled[0].shape  # Datapoints in the dataset, input features.
    n_batches = n // 100                        # The number of batches.
    bs = n // n_batches                         # The batch size.

    # Initialize the auxiliary deep generative model.
    model = RSDGM(n_c=int(n_c), n_l=int(n_l), n_a=100, n_z=128, n_y=num_classes,
                  qa_hid=[100], qz_hid=[100], qy_hid=[100],
                  px_hid=[128], pa_hid=[100],
                  nonlinearity=rectify, batchnorm=False, x_dist='gaussian')

    # Copy script to output folder.
    copy_script(__file__, model)

    # Create output path for PCA plot.
    makedirs(model.get_root_path() + '/training custom evals/pca')

    # Get the training functions.
    f_train, f_test, f_validate, train_args, test_args, validate_args = model.build_model(
        train_set_unlabeled, train_set_labeled, test_set)

    # Update the default function arguments.
    train_args['inputs']['batchsize_unlabeled'] = bs
    train_args['inputs']['batchsize_labeled'] = n_labeled
    train_args['inputs']['beta'] = .1
    train_args['inputs']['learningrate'] = 3e-4
    train_args['inputs']['beta1'] = 0.9
    train_args['inputs']['beta2'] = 0.999
    train_args['inputs']['samples'] = 1
    train_args['inputs']['warmup'] = 1.1

    def custom_evaluation(model, path):
        # Get model output.
        x_ = test_set[0]
        y_ = test_set[1]
        # qy = model.f_qy(x_, 1)
        qa = model.f_qa(x_, 1)
        qz = model.f_qz(x_, y_, 1)
        # pa = model.f_pa(qz, y_, 1)
        px = model.f_px(qa, qz, y_, 1)
        px_mu = model.f_mu(qa, qz, y_, 1)
        px_var = np.exp(model.f_var(qa, qz, y_, 1))

        # Reduce y to integers.
        y_ = np.argmax(y_, axis=1)

        plt.clf()
        f, axarr = plt.subplots(nrows=len(y_unique), ncols=2)
        for idx, y_l in enumerate(y_unique):
            l_idx = y_ == y_l
            axarr[idx, 0].plot(x_[l_idx][:2].reshape(-1, n_c))
            axarr[idx, 0].plot(px[l_idx][:2].reshape(-1, n_c), linestyle='dotted')
            axarr[idx, 1].plot(px_mu[l_idx][:2].reshape(-1, n_c), label="mu")
            axarr[idx, 1].plot(px_var[l_idx][:2].reshape(-1, n_c), label="var")
        plt.legend()
        f.set_size_inches(12, 8)
        f.savefig(path, dpi=100, format='png')
        plt.close(f)

        # Plot PCA decompositions of the latent spaces Z and A.
        z_pca = PCA(n_components=2).fit_transform(qz)
        a_pca = PCA(n_components=2).fit_transform(qa)

        palette = itertools.cycle(sns.color_palette())
        plt.clf()
        plt.figure()
        f, axarr = plt.subplots(ncols=2)
        for i in set(y_unique):
            c = next(palette)
            axarr[0].scatter(z_pca[y_ == i, 0], z_pca[y_ == i, 1], c=c, alpha=0.8)
            axarr[1].scatter(a_pca[y_ == i, 0], a_pca[y_ == i, 1], c=c, alpha=0.8, label=str(i))
        plt.legend()
        plt.title('PCA of Z and A')
        f.set_size_inches(10, 6)
        plt.savefig(path.replace('custom_eval_plot', 'pca/z'), dpi=100, format='png')
        plt.close()

    # Define training loop. Output training evaluations every 1 epoch
    # and the custom evaluation method every 10 epochs.
    train = TrainModel(model=model, output_freq=1,
                       pickle_f_custom_freq=10, f_custom_eval=custom_evaluation)
    train.add_initial_training_notes("Training the rsdgm with bn %s. seed %i." %
                                     (str(model.batchnorm), seed))
    train.train_model(f_train, train_args,
                      f_test, test_args,
                      f_validate, validate_args,
                      n_train_batches=n_batches,
                      n_epochs=1000,
                      anneal=[("learningrate", 100, 0.75, 3e-5),
                              ("warmup", 1, 0.99, 0.1)])
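# Illustrative sketch (not part of the pipeline): the two chained train_test_split
# calls above first hold out a fixed-size test set, then peel a small labelled pool
# off the remaining training data for the semi-supervised objective. Toy data,
# known sklearn API; the sizes here are arbitrary.
def _sketch_semisupervised_split():
    import numpy as np
    from sklearn.model_selection import train_test_split

    X = np.random.randn(5000, 50, 3).astype(np.float32)
    y = np.random.randint(0, 6, size=5000)

    # Fixed-size test set first.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000)

    # Then a small labelled subset, stratified so every class is represented.
    # The original stratifies on np.argmax(y_train, axis=1) because its y is one-hot.
    X_lab, X_unlab, y_lab, y_unlab = train_test_split(
        X_train, y_train, train_size=600, stratify=y_train)
    return (X_lab, y_lab), (X_unlab, y_unlab), (X_test, y_test)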
# Combine in sets.
train_set = (data, [])
test_set = (test_data, [])
print('Train size: ', train_set[0].shape)
print('Test size: ', test_set[0].shape)

n, n_l, n_c = train_set[0].shape
n_batches = n // batch_size

# Initialize the recurrent autoencoder.
model = RAE(n_c=int(n_c), n_l=int(n_l), px_hid=[32], enc_rnn=32, dec_rnn=32,
            nonlinearity=leaky_rectify, batchnorm=False)

# Copy script to output folder.
copy_script(__file__, model)

# Get the training functions.
f_train, f_test, f_validate, train_args, test_args, validate_args = model.build_model(
    train_set, test_set)

# Update the default function arguments.
train_args['inputs']['batchsize'] = batch_size
train_args['inputs']['learningrate'] = 0.005
train_args['inputs']['beta1'] = 0.9
train_args['inputs']['beta2'] = 0.999

def custom_evaluation(model, path):
    # Dump the trained encoder at each evaluation checkpoint.
    model.save_encoder()
def main():
    n_samples, step = 50, 25
    load_data = LoadHAR(add_pitch=False, add_roll=False, add_filter=False,
                        n_samples=n_samples, diff=False, step=step,
                        normalize='segments', comp_magnitude=False,
                        simple_labels=False, common_labels=False)
    X, y, name, users, stats = load_data.uci_hapt()
    limited_labels = y < 18
    y = y[limited_labels]
    X = X[limited_labels].astype(np.float32)
    users = users[limited_labels]

    # Compress labels onto a contiguous 0..K-1 range.
    for idx, label in enumerate(np.unique(y)):
        if not np.equal(idx, label):
            y[y == label] = idx

    y_unique = np.unique(y)
    y = one_hot(y, len(y_unique))
    num_classes = len(y_unique)

    # Split into train and test stratified by the class label.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        stratify=np.argmax(y, axis=1),
                                                        random_state=1)

    # Combine in sets.
    train_set = (X_train, y_train)
    test_set = (X_test, y_test)
    print('Train size: ', train_set[0].shape)
    print('Test size: ', test_set[0].shape)

    n, n_l, n_c = train_set[0].shape  # Datapoints in the dataset, input features.
    n_batches = n // 100              # The number of batches.
    bs = n // n_batches               # The batch size.

    model = CAE(n_in=(int(n_l), int(n_c)), filters=[8, 16, 32, 64, 128],
                n_hidden=128, n_out=n_samples, trans_func=leaky_rectify, stats=0)

    # Copy script to output folder.
    copy_script(__file__, model)

    # Build model.
    f_train, f_test, f_validate, train_args, test_args, validate_args = model.build_model(
        train_set, test_set)

    def custom_evaluation(model, path):
        # Get model output.
        x_ = test_set[0]
        y_ = test_set[1]
        xhat = model.f_px(x_)

        # Reduce y to integers.
        y_ = np.argmax(y_, axis=1)

        plt.clf()
        f, axarr = plt.subplots(nrows=num_classes, ncols=n_c)
        for idx, y_l in enumerate(y_unique):
            l_idx = y_ == y_l
            for c in range(n_c):
                axarr[idx, c].plot(x_[l_idx, :, c][:2].reshape(-1), color='red')
                axarr[idx, c].plot(xhat[l_idx, :, c][:2].reshape(-1), color='blue',
                                   linestyle='dotted')
        f.set_size_inches(12, 3 * num_classes)
        f.savefig(path, dpi=100, format='png')
        plt.close(f)

    train = TrainModel(model=model, output_freq=1,
                       pickle_f_custom_freq=10, f_custom_eval=custom_evaluation)
    train.pickle = False
    train.write_to_logger("Normalizing: %s" % load_data.normalize)
    train.write_to_logger("Simple labels: %s" % load_data.simple_labels)
    train.write_to_logger("Common labels: %s" % load_data.common_labels)
    train.write_to_logger("Sequence length: %d" % load_data.n_samples)
    train.write_to_logger("Step: %d" % load_data.step)
    train.write_to_logger("Add pitch: %s\nAdd roll: %s" % (load_data.add_pitch, load_data.add_roll))
    train.write_to_logger("Only magnitude: %s" % load_data.comp_magnitude)
    train.write_to_logger("Lowpass: %s" % str(load_data.lowpass))
    train.write_to_logger("Add filter separated signals: %s" % load_data.add_filter)
    train.write_to_logger("Differentiate: %s" % load_data.differentiate)

    train_args['inputs']['batchsize'] = bs
    train_args['inputs']['learningrate'] = 1e-3
    train_args['inputs']['beta1'] = 0.9
    train_args['inputs']['beta2'] = 0.999

    train.train_model(f_train, train_args,
                      f_test, test_args,
                      f_validate, validate_args,
                      n_train_batches=int(n_batches),
                      n_epochs=2000,
                      anneal=[("learningrate", 100, 0.75, 3e-5)])
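# Illustrative sketch (not part of the pipeline): the custom_evaluation plots in
# these scripts share one layout, a row per class with the original signal in
# solid red against the reconstruction in dotted blue. Matplotlib-only version
# with random arrays standing in for model output.
def _sketch_reconstruction_grid(path='recon_grid.png'):
    import numpy as np
    import matplotlib.pyplot as plt

    num_classes, seq_len, n_channels = 3, 50, 3
    x = np.random.randn(num_classes, seq_len, n_channels)    # "originals"
    xhat = x + 0.1 * np.random.randn(*x.shape)               # "reconstructions"

    f, axarr = plt.subplots(nrows=num_classes, ncols=n_channels, squeeze=False)
    for row in range(num_classes):
        for c in range(n_channels):
            axarr[row, c].plot(x[row, :, c], color='red')
            axarr[row, c].plot(xhat[row, :, c], color='blue', linestyle='dotted')
    f.set_size_inches(12, 3 * num_classes)
    f.savefig(path, dpi=100, format='png')
    plt.close(f)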
def run_vrae_har():
    seed = np.random.randint(1, 2147462579)

    # def sinus_seq(period, samples, length):
    #     X = np.linspace(-np.pi*(samples/period), np.pi*(samples/period), samples)
    #     X = np.reshape(np.sin(X), (-1, length, 1))
    #     X += np.random.randn(*X.shape)*0.1
    #     X = (X - np.min(X))/(np.max(X) - np.min(X))
    #     return X, np.ones((samples/length, 1))
    #
    # X1, y1 = sinus_seq(20, 100000, 40)
    # X2, y2 = sinus_seq(12, 100000, 40)
    # X3, y3 = sinus_seq(8, 100000, 40)
    #
    # X = np.concatenate((X1, X2, X3)).astype('float32')
    # y = np.concatenate((y1*0, y2*1, y3*2), axis=0).astype('int')[:, 0]
    # y_unique = np.unique(list(y))
    # y = one_hot(y, len(y_unique))
    #
    # dim_samples, dim_sequence, dim_features = X.shape
    # X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

    # HAR data
    # X, y, users, stats = har.load()
    n_samples, step = 50, 25
    load_data = LoadHAR(add_pitch=False, add_roll=False, add_filter=False,
                        n_samples=n_samples, diff=False, step=step,
                        normalize='segments', comp_magnitude=False,
                        simple_labels=False, common_labels=False)
    X, y, name, users, stats = load_data.uci_hapt()
    limited_labels = y < 18
    y = y[limited_labels]
    X = X[limited_labels].astype(np.float32)
    users = users[limited_labels]
    # X -= X.mean(axis=0)

    # Compress labels onto a contiguous 0..K-1 range.
    for idx, label in enumerate(np.unique(y)):
        if not np.equal(idx, label):
            y[y == label] = idx

    y_unique = np.unique(y)
    y = one_hot(y, len(y_unique))

    dim_samples, dim_sequence, n_c = X.shape
    num_classes = len(y_unique)

    # Split into train and test stratified by the class label.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        stratify=np.argmax(y, axis=1),
                                                        random_state=1)

    # Combine in sets.
    train_set = (X_train, y_train)
    test_set = (X_test, y_test)
    print('Train size: ', train_set[0].shape)
    print('Test size: ', test_set[0].shape)

    n, n_l, n_c = train_set[0].shape  # Datapoints in the dataset, input features.
    n_batches = n // 100              # The number of batches.
    bs = n // n_batches               # The batch size.

    # Initialize the recurrent variational autoencoder.
    model = RVAE(n_c=n_c, n_z=256, qz_hid=[256, 256], px_hid=[256, 256],
                 enc_rnn=256, dec_rnn=256, n_l=n_l,
                 nonlinearity=rectify, batchnorm=False,
                 x_dist='gaussian', px_nonlinearity=None)

    # Copy script to output folder.
    copy_script(__file__, model)

    # Create output path for PCA plot.
    makedirs(model.get_root_path() + '/training custom evals/pca')

    # Get the training functions.
    f_train, f_test, f_validate, train_args, test_args, validate_args = model.build_model(
        train_set, test_set)

    # Update the default function arguments.
    train_args['inputs']['batchsize'] = bs
    train_args['inputs']['learningrate'] = 1e-3
    train_args['inputs']['beta1'] = 0.9
    train_args['inputs']['beta2'] = 0.999
    train_args['inputs']['samples'] = 1
    train_args['inputs']['warmup'] = 1.1

    def custom_evaluation(model, path):
        # Get model output.
        x_ = test_set[0]
        y_ = test_set[1]
        qz = model.f_qz(x_, 1)
        px = model.f_px(x_, qz, 1)
        px_mu = model.f_mu(x_, qz, 1)
        px_var = np.exp(model.f_var(x_, qz, 1))

        # Reduce y to integers.
        y_ = np.argmax(y_, axis=1)

        plt.clf()
        f, axarr = plt.subplots(nrows=num_classes, ncols=n_c * 2)
        for idx, y_l in enumerate(y_unique):
            l_idx = y_ == y_l
            for c in range(n_c):
                axarr[idx, c * 2].plot(x_[l_idx, :, c][:2].reshape(-1))
                axarr[idx, c * 2].plot(px[l_idx, :, c][:2].reshape(-1), linestyle='dotted')
                axarr[idx, c * 2 + 1].plot(px_mu[l_idx, :, c][:2].reshape(-1), label="mu")
                axarr[idx, c * 2 + 1].plot(px_var[l_idx, :, c][:2].reshape(-1), label="var")
        plt.legend()
        f.set_size_inches(20, num_classes * 3)
        f.savefig(path, dpi=100, format='png')
        plt.close(f)

        # Plot PCA decomposition of the latent space.
        z_pca = PCA(n_components=2).fit_transform(qz)
        palette = itertools.cycle(sns.color_palette())
        plt.clf()
        plt.figure()
        for i in set(y_unique):
            plt.scatter(z_pca[y_ == i, 0], z_pca[y_ == i, 1], c=next(palette), alpha=0.8)
        plt.legend()
        plt.title('PCA of Z')
        plt.savefig(path.replace('custom_eval_plot', 'pca/z'))
        plt.close()

    def anneal_func(value):
        return value - 0.01

    # Define training loop. Output training evaluations every 1 epoch
    # and the custom evaluation method every 10 epochs.
    train = TrainModel(model=model, output_freq=1,
                       pickle_f_custom_freq=10, f_custom_eval=custom_evaluation)
    train.add_initial_training_notes("Training the vrae with bn %s. seed %i." %
                                     (str(model.batchnorm), seed))
    train.train_model(f_train, train_args,
                      f_test, test_args,
                      f_validate, validate_args,
                      n_train_batches=n_batches,
                      n_epochs=1000,
                      # Any symbolic model variable can be annealed during training
                      # with a tuple of (var_name, every, scale constant, minimum value).
                      anneal=[("learningrate", 100, 0.75, 3e-5),
                              ("warmup", 1, anneal_func, 0.1)])
    image_to_movie.create(model.get_root_path() + '/training custom evals', rate=3)
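# Illustrative sketch (not the library's implementation): per the comment in
# train_model above, each anneal tuple is (var_name, every, scale, minimum). The
# scale slot takes a constant multiplier, as in ("learningrate", 100, 0.75, 3e-5),
# or, as with anneal_func, a callable. One plausible reading of those semantics:
def _sketch_anneal(values, schedules, epoch):
    """values: dict of variable name -> current value;
    schedules: list of (var_name, every, scale, minimum) tuples."""
    for var_name, every, scale, minimum in schedules:
        if epoch > 0 and epoch % every == 0:
            new = scale(values[var_name]) if callable(scale) else values[var_name] * scale
            values[var_name] = max(new, minimum)  # never anneal below the floor
    return values

# Example: after 300 epochs the learning rate has been scaled by 0.75 three times,
# and warmup has been decremented by 0.01 each epoch down to its 0.1 floor.
# params = {"learningrate": 1e-3, "warmup": 1.1}
# for epoch in range(1, 301):
#     _sketch_anneal(params, [("learningrate", 100, 0.75, 3e-5),
#                             ("warmup", 1, lambda v: v - 0.01, 0.1)], epoch)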