print("Test Latent:", Z1_test.shape, Z2_test.shape, Z3_test.shape) y_pred = np.argmax(y_pred_proba, axis=-1) evaluate(y_true=X_test_true, y_pred_proba=y_pred_proba, labels=labels, title="Test set (Deep prediction)", path=os.path.join(EXP_DIR, 'test_deep.pdf')) # ====== make a streamline classifier ====== # # training PLDA Z3_train, y_train = make_dnn_prediction(f_z3, X=train, title="TRAIN") print("Z3_train:", Z3_train.shape, y_train.shape) Z3_valid, y_valid = make_dnn_prediction(f_z3, X=valid, title="VALID") print("Z3_valid:", Z3_valid.shape, y_valid.shape) plda = PLDA(n_phi=200, random_state=K.get_rng().randint(10e8), n_iter=12, labels=labels, verbose=0) plda.fit(np.concatenate([Z3_train, Z3_valid], axis=0), np.concatenate([y_train, y_valid], axis=0)) y_pred_log_proba = plda.predict_log_proba(Z3_test) evaluate(y_true=X_test_true, y_pred_log_proba=y_pred_log_proba, labels=labels, title="Test set (PLDA - Latent prediction)", path=os.path.join(EXP_DIR, 'test_latent.pdf')) # ====== visualize ====== # visualize_latent_space(X_org=X_test_data, X_latent=Z1_test, name=X_test_name, labels=X_test_true,
X_score_tsne_pca = tsne_pca.fit_transform(X_score_pca) # ====== tsne ====== # tsne = TSNE(n_components=NUM_DIM, perplexity=30.0, learning_rate=200.0, n_iter=1000, random_state=SEED) X_train_tsne = tsne.fit_transform(X_train) X_score_tsne = tsne.fit_transform(X_score) # ====== lda ====== # lda = LinearDiscriminantAnalysis(n_components=NUM_DIM) lda.fit(X_train, y_train) X_train_lda = lda.transform(X_train) X_score_lda = lda.transform(X_score) # ====== plda ====== # plda = PLDA(n_phi=NUM_DIM, random_state=SEED) plda.fit(X_train, y_train) X_train_plda = plda.predict_log_proba(X_train) X_score_plda = plda.predict_log_proba(X_score) # ====== gmm ====== # gmm = GaussianMixture(n_components=NUM_DIM, max_iter=100, covariance_type='full', random_state=SEED) gmm.fit(X_train) X_train_gmm = gmm._estimate_weighted_log_prob(X_train) X_score_gmm = gmm._estimate_weighted_log_prob(X_score) # ====== rbm ====== # rbm = BernoulliRBM(n_components=NUM_DIM, batch_size=8, learning_rate=0.0008,
def fast_pca(*x,
             n_components=None,
             algo='rpca',
             y=None,
             batch_size=1024,
             return_model=False,
             random_state=1234):
    """ A shortcut for many different PCA algorithms

    The model is fitted on the first matrix only, then used to transform
    every given matrix (including the first one).

    Parameters
    ----------
    x : {list, tuple}
        list of matrices for transformation, the first matrix will
        be used for training (nested lists are flattened)
    n_components : {None, int}
        number of PCA components
    algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'}
        different PCA algorithm:
            'pca' - PCA,
            'ipca' - IncrementalPCA,
            'ppca' - Probabilistic PCA,
            'sppca' - Supervised Probabilistic PCA,
            'plda' - Probabilistic LDA,
            'rpca' - randomized PCA using randomized SVD
    y : {numpy.ndarray, None}
        labels, required when `algo` is 'sppca' or 'plda'
    batch_size : int (default: 1024)
        batch size, only used for IncrementalPCA
    return_model : bool (default: False)
        if True, return the trained PCA model as the FIRST return
    random_state : int (default: 1234)
        forwarded to the constructor of every model except IncrementalPCA

    Returns
    -------
    The transformed training matrix when only one matrix is given and
    `return_model=False`; otherwise a tuple
    `([model,] x_train, x_test_1, ...)`.
    Inputs with more than 2 dimensions are flattened to 2D before fitting;
    they are reshaped back to their original trailing shape only when
    `n_components` is None and the model kept the full feature dimension.

    Raises
    ------
    ValueError
        if `algo` is not supported, or no input matrix is given
    RuntimeError
        if `algo` requires labels but `y` is None
    """
    batch_size = int(batch_size)
    algo = str(algo).lower()
    if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'):
        raise ValueError(
            "`algo` must be one of the following: 'pca', 'ipca', "
            "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" % algo)
    if algo in ('sppca', 'plda') and y is None:
        raise RuntimeError(
            "`y` must be not None if `algo='sppca'` or `algo='plda'`")
    x = flatten_list(x, level=None)
    # MmapData objects are sliced with [:] before use; everything else is
    # passed through untouched
    x = [i[:] if i.__class__.__name__ == 'MmapData' else i for i in x]
    if not x:
        raise ValueError("At least one matrix must be given for training")
    # ====== check input ====== #
    x_train = x[0]
    x_test = x[1:]
    input_shape = None
    n_features = None
    if x_train.ndim > 2:  # only 2D for PCA
        input_shape = (-1,) + x_train.shape[1:]
        n_features = int(np.prod(input_shape[1:]))
        new_shape = (-1, n_features)
        x_train = np.reshape(x_train, new_shape)
        x_test = [np.reshape(mat, new_shape) for mat in x_test]
        if n_components is not None:  # no need to reshape back
            input_shape = None
    # ====== train PCA ====== #
    if algo == 'sppca':
        pca = SupervisedPPCA(n_components=n_components,
                             random_state=random_state)
        pca.fit(x_train, y)
    elif algo == 'plda':
        from odin.ml import PLDA
        pca = PLDA(n_phi=n_components, random_state=random_state)
        pca.fit(x_train, y)
    elif algo == 'pca':
        pca = PCA(n_components=n_components, random_state=random_state)
        pca.fit(x_train)
    elif algo == 'rpca':
        # we copy the implementation of RandomizedPCA because
        # it is significantly faster than PCA(svd_solver='randomize')
        pca = RandomizedPCA(n_components=n_components,
                            iterated_power=2,
                            random_state=random_state)
        pca.fit(x_train)
    elif algo == 'ipca':
        pca = IncrementalPCA(n_components=n_components,
                             batch_size=batch_size)
        prog = Progbar(target=x_train.shape[0],
                       print_report=False,
                       print_summary=False,
                       name="Fitting PCA")
        for start, end in batching(batch_size=batch_size,
                                   n=x_train.shape[0],
                                   seed=1234):
            pca.partial_fit(x_train[start:end], check_input=False)
            prog.add(end - start)
    elif algo == 'ppca':
        pca = PPCA(n_components=n_components, random_state=random_state)
        pca.fit(x_train)
    # ====== transform ====== #
    x_train = pca.transform(x_train)
    x_test = [pca.transform(mat) for mat in x_test]
    # Reshape back to the original shape only if the model kept every
    # feature column. With `n_components=None` a model may still return
    # fewer columns (e.g. fewer samples than features); reshaping then would
    # either fail or silently regroup samples, so the 2D result is returned.
    if input_shape is not None and x_train.shape[-1] == n_features:
        x_train = np.reshape(x_train, input_shape)
        x_test = [np.reshape(mat, input_shape) for mat in x_test]
    # ====== return the results ====== #
    outputs = [x_train] + x_test
    if return_model:
        outputs.insert(0, pca)
    if len(outputs) == 1:
        return outputs[0]
    return tuple(outputs)
# ====== tsne on the PCA projections ====== #
# `fit_transform` is called once per split, so the train and score
# embeddings are fitted independently of each other.
tsne_pca = TSNE(
    n_components=NUM_DIM,
    perplexity=30.0,
    learning_rate=200.0,
    n_iter=1000,
    random_state=SEED,
)
X_train_tsne_pca, X_score_tsne_pca = (
    tsne_pca.fit_transform(split) for split in (X_train_pca, X_score_pca)
)

# ====== tsne ====== #
tsne = TSNE(
    n_components=NUM_DIM,
    perplexity=30.0,
    learning_rate=200.0,
    n_iter=1000,
    random_state=SEED,
)
X_train_tsne, X_score_tsne = (
    tsne.fit_transform(split) for split in (X_train, X_score)
)

# ====== lda ====== #
# fitted on the training split only, then applied to both splits
lda = LinearDiscriminantAnalysis(n_components=NUM_DIM).fit(X_train, y_train)
X_train_lda, X_score_lda = (
    lda.transform(split) for split in (X_train, X_score)
)

# ====== plda ====== #
# the PLDA features are the per-class log-probabilities
plda = PLDA(n_phi=NUM_DIM, random_state=SEED)
plda.fit(X_train, y_train)
X_train_plda, X_score_plda = (
    plda.predict_log_proba(split) for split in (X_train, X_score)
)

# ====== gmm ====== #
# NOTE: `_estimate_weighted_log_prob` is a private scikit-learn method
gmm = GaussianMixture(
    n_components=NUM_DIM,
    max_iter=100,
    covariance_type='full',
    random_state=SEED,
).fit(X_train)
X_train_gmm, X_score_gmm = (
    gmm._estimate_weighted_log_prob(split) for split in (X_train, X_score)
)

# ====== rbm ====== #
rbm = BernoulliRBM(
    n_components=NUM_DIM,
    batch_size=8,
    learning_rate=0.0008,
    n_iter=8,
    verbose=2,
    random_state=SEED,
).fit(X_train)
X_train_rbm, X_score_rbm = (
    rbm.transform(split) for split in (X_train, X_score)
)
# Prediction
# ===========================================================================
# One pass over the test data with all four functions: the class
# probabilities and the three latent outputs.
y_pred_proba, Z1_test, Z2_test, Z3_test = make_dnn_prediction(
    functions=[f_pred_proba, f_z1, f_z2, f_z3],
    X=X_test_data,
    title='TEST',
)
print("Test Latent:", Z1_test.shape, Z2_test.shape, Z3_test.shape)
y_pred = np.argmax(y_pred_proba, axis=-1)
evaluate(
    y_true=X_test_true,
    y_pred_proba=y_pred_proba,
    labels=labels,
    title="Test set (Deep prediction)",
    path=os.path.join(EXP_DIR, 'test_deep.pdf'),
)

# ====== make a streamline classifier ====== #
# training PLDA on the third latent output of the train + valid splits
Z3_train, y_train = make_dnn_prediction(f_z3, X=train, title="TRAIN")
print("Z3_train:", Z3_train.shape, y_train.shape)
Z3_valid, y_valid = make_dnn_prediction(f_z3, X=valid, title="VALID")
print("Z3_valid:", Z3_valid.shape, y_valid.shape)

plda = PLDA(
    n_phi=200,
    random_state=K.get_rng().randint(10e8),
    n_iter=12,
    labels=labels,
    verbose=0,
)
# np.concatenate joins along axis 0 by default
plda.fit(
    np.concatenate((Z3_train, Z3_valid)),
    np.concatenate((y_train, y_valid)),
)
y_pred_log_proba = plda.predict_log_proba(Z3_test)
evaluate(
    y_true=X_test_true,
    y_pred_log_proba=y_pred_log_proba,
    labels=labels,
    title="Test set (PLDA - Latent prediction)",
    path=os.path.join(EXP_DIR, 'test_latent.pdf'),
)

# ====== visualize ====== #
# only the first two latent outputs are plotted; all figures are saved
# into a single PDF
for _latent, _title in ((Z1_test, "latent1"), (Z2_test, "latent2")):
    visualize_latent_space(
        X_org=X_test_data,
        X_latent=_latent,
        name=X_test_name,
        labels=X_test_true,
        title=_title,
    )
V.plot_save(os.path.join(EXP_DIR, 'latent.pdf'))
# ===========================================================================
# Training the PLDA
# ===========================================================================
# ====== training the LDA ====== #
# After this section `X_backend` holds the data that goes straight into PLDA
# (LDA-projected when N_LDA > 0, untouched otherwise), and `lda_transform`
# is the matching projection for vectors that are still unprojected.
if N_LDA > 0:
    print(" Fitting LDA ...")
    lda = LinearDiscriminantAnalysis(n_components=N_LDA)
    X_backend = lda.fit_transform(X=X_backend, y=y_backend)
    lda_transform = lda.transform
else:
    lda_transform = lambda x: x
# ====== training the PLDA ====== #
plda = PLDA(n_phi=N_PLDA,
            centering=True,
            wccn=True,
            unit_length=True,
            n_iter=20,
            random_state=Config.SUPER_SEED,
            verbose=2 if PLDA_SHOW_LLK else 1)
# `X_backend` is passed as-is: when N_LDA > 0 it was already projected by
# `fit_transform` above, so calling `lda_transform` on it again would feed
# N_LDA-dimensional data into an LDA fitted on the original dimensionality.
if PLDA_MAXIMUM_LIKELIHOOD:
    print(" Fitting PLDA maximum likelihood ...")
    plda.fit_maximum_likelihood(X=X_backend, y=y_backend)
plda.fit(X=X_backend, y=y_backend)
# ===========================================================================
# Now scoring
# ===========================================================================
for dsname, scores in sorted(all_vectors.items(), key=lambda x: x[0]):
    # ====== skip non scoring dataset ====== #
    if dsname not in SCORING_DATASETS:
        continue
    # ====== proceed ====== #