def best_lda_cluster_spam(self):
    """Cluster the LDA-reduced spam data with K-Means and a Gaussian mixture
    and save the transformed train/test features plus labels to disk.
    """
    dh = data_helper()  # was constructed twice in the original; once suffices
    X_train, X_test, y_train, y_test = dh.get_spam_data_lda_best()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    def save(prefix, splits):
        # Write each split to './<save_dir>/spam_<prefix>_lda_<suffix>.txt'.
        for suffix, data in splits:
            filename = './' + self.save_dir + '/spam_' + prefix + '_lda_' + suffix + '.txt'
            pd.DataFrame(data).to_csv(filename, header=False, index=False)

    ##
    ## K-Means
    ##
    km = KMeans(n_clusters=4, algorithm='full')
    X_train_transformed = km.fit_transform(X_train_scl)
    X_test_transformed = km.transform(X_test_scl)
    save('kmeans', [('x_train', X_train_transformed), ('x_test', X_test_transformed),
                    ('y_train', y_train), ('y_test', y_test)])

    ##
    ## GMM
    ##
    # BUG FIX: the original re-used the fitted K-Means transform here, so the
    # "gmm" files actually contained K-Means distances. Fit the mixture and
    # use its posterior probabilities as the transformed features.
    gmm = GaussianMixture(n_components=4, covariance_type='full')
    gmm.fit(X_train_scl)
    X_train_transformed = gmm.predict_proba(X_train_scl)
    X_test_transformed = gmm.predict_proba(X_test_scl)
    save('gmm', [('x_train', X_train_transformed), ('x_test', X_test_transformed),
                 ('y_train', y_train), ('y_test', y_test)])
def best_lda_cluster_wine(self):
    """Cluster the LDA-reduced wine data with K-Means and a Gaussian mixture
    and save the transformed train/test features plus labels to disk.
    """
    dh = data_helper()  # was constructed twice in the original; once suffices
    X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    def save(prefix, splits):
        # Write each split to './<save_dir>/wine_<prefix>_lda_<suffix>.txt'.
        for suffix, data in splits:
            filename = './' + self.save_dir + '/wine_' + prefix + '_lda_' + suffix + '.txt'
            pd.DataFrame(data).to_csv(filename, header=False, index=False)

    ##
    ## K-Means
    ##
    km = KMeans(n_clusters=4, algorithm='full')
    X_train_transformed = km.fit_transform(X_train_scl)
    X_test_transformed = km.transform(X_test_scl)
    save('kmeans', [('x_train', X_train_transformed), ('x_test', X_test_transformed),
                    ('y_train', y_train), ('y_test', y_test)])

    ##
    ## GMM
    ##
    # BUG FIX: the original re-used the fitted K-Means transform here, so the
    # "gmm" files actually contained K-Means distances. Fit the mixture and
    # use its posterior probabilities as the transformed features.
    gmm = GaussianMixture(n_components=4, covariance_type='full')
    gmm.fit(X_train_scl)
    X_train_transformed = gmm.predict_proba(X_train_scl)
    X_test_transformed = gmm.predict_proba(X_test_scl)
    save('gmm', [('x_train', X_train_transformed), ('x_test', X_test_transformed),
                 ('y_train', y_train), ('y_test', y_test)])
def predict_on_line_demo(CHECKPOINT_PATH, line=u'我最近好累啊\t要好好休息,知道吗\t知道你过得不好我就放心了'):
    """Restore an ADEM model from CHECKPOINT_PATH and score a single
    tab-separated (context, reference-response, model-response) line,
    printing the predicted score.

    The network hyper-parameters are rebuilt from the configuration encoded
    in the checkpoint filename.
    """
    config = resolve_filename(CHECKPOINT_PATH)  # decode training config from the checkpoint name
    dh = data_helper(config)
    config_network = {
        'HIDDEN_SIZE': 256,
        'NUM_LAYERS': 2,
        'SRC_VOCAB_SIZE': dh.vocab_size,
        'KEEP_PROB': 1,  # inference: dropout disabled
        'MAX_GRAD_NORM': 5,
        'word_embedding_file': 'word_dic_jieba_embedding.pk' if config['seg'] == 'jieba' else 'word_dic_nioseg_embedding.pk',
        'max_len': dh.max_len
    }
    with tf.Session() as sess:
        train_model = ADEM_model(config, config_network)
        tf.global_variables_initializer().run()
        saver = tf.train.Saver()
        saver.restore(sess, CHECKPOINT_PATH)
        # NOTE(review): the jieba dictionary is used regardless of
        # config['seg'] — confirm this is intended for non-jieba checkpoints.
        train_model.set_word_dic('word_dic_jieba')
        print(train_model.predict_on_line(sess=sess, line=line))
def best_rp_nba(self):
    """Gaussian random projection of the NBA data, keeping the two components
    with the highest training-set kurtosis; saves the splits to disk.
    """
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_nba_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    # GaussianRandomProjection is unsupervised; y is not needed for the fit.
    rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
    X_train_transformed = rp.fit_transform(X_train_scl)
    X_test_transformed = rp.transform(X_test_scl)

    ## top 2
    # BUG FIX: derive the component ordering from the training data only and
    # apply the same ordering to the test data; the original re-sorted the
    # test columns by their own kurtosis, which could select different,
    # misaligned components for train and test.
    kurt = kurtosis(X_train_transformed)
    i = kurt.argsort()[::-1]
    X_train_transformed = X_train_transformed[:, i][:, 0:2]
    X_test_transformed = X_test_transformed[:, i][:, 0:2]

    # save
    filename = './' + self.save_dir + '/nba_rp_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/nba_rp_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/nba_rp_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/nba_rp_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def best_lda_pima(self):
    """Reduce the Pima data to 2 LDA components and persist the splits."""
    helper = data_helper()
    X_train, X_test, y_train, y_test = helper.get_pima_data()

    scaler = RobustScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    # LDA is supervised, so the training labels drive the projection.
    lda = LinearDiscriminantAnalysis(n_components=2)
    projected_train = lda.fit_transform(train_scaled, y_train)
    projected_test = lda.transform(test_scaled)

    # One file per split, named './<save_dir>/pima_lda_<suffix>.txt'.
    for suffix, data in (('x_train', projected_train), ('x_test', projected_test),
                         ('y_train', y_train), ('y_test', y_test)):
        target = './' + self.save_dir + '/pima_lda_' + suffix + '.txt'
        pd.DataFrame(data).to_csv(target, header=False, index=False)
def nn_rp_cluster_wine(self):
    """Neural-network benchmarks on the RP-reduced wine data, one run per
    clustering variant (K-Means and GMM)."""
    helper = data_helper()
    for loader, title in (
        (helper.get_wine_data_kmeans_rp_best, 'Neural Network RP K-Means'),
        (helper.get_wine_data_gmm_rp_best, 'Neural Network RP GMM'),
    ):
        X_train, X_test, y_train, y_test = loader()
        self.part4.nn_analysis(X_train, X_test, y_train, y_test, 'Wine', title)
def best_ica_wine(self):
    """FastICA on the wine data, keeping the two components with the highest
    training-set kurtosis; saves the train/test splits to disk.
    """
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    # FastICA is unsupervised; y is not needed for the fit.
    ica = FastICA(n_components=X_train_scl.shape[1])
    X_train_transformed = ica.fit_transform(X_train_scl)
    X_test_transformed = ica.transform(X_test_scl)

    ## top 2
    # BUG FIX: derive the component ordering from the training data only and
    # apply the same ordering to the test data; the original re-sorted the
    # test columns by their own kurtosis, which could select different,
    # misaligned components for train and test.
    kurt = kurtosis(X_train_transformed)
    i = kurt.argsort()[::-1]
    X_train_transformed = X_train_transformed[:, i][:, 0:2]
    X_test_transformed = X_test_transformed[:, i][:, 0:2]

    # save
    filename = './' + self.save_dir + '/wine_ica_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_ica_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_ica_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_ica_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def best_pca_spam(self):
    """Project the spam data onto its top 3 principal components and save
    the transformed train/test splits plus labels to disk.
    """
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_spam_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    # PCA is unsupervised — the original passed y_train to fit_transform,
    # where it is silently ignored and only misleads the reader.
    pca = PCA(n_components=3)
    X_train_transformed = pca.fit_transform(X_train_scl)
    X_test_transformed = pca.transform(X_test_scl)

    # save
    filename = './' + self.save_dir + '/spam_pca_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/spam_pca_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/spam_pca_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/spam_pca_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def nn_wine_orig(self):
    """Neural-network benchmark on the scaled, unreduced wine data."""
    X_train, X_test, y_train, y_test = data_helper().get_wine_data()
    scaler = RobustScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)
    self.part4.nn_analysis(train_scaled, test_scaled, y_train, y_test,
                           'Wine', 'Neural Network Original')
def nn_ica_cluster_spam(self):
    """Neural-network benchmarks on the ICA-reduced spam data, one run per
    clustering variant (K-Means and GMM)."""
    helper = data_helper()
    for loader, title in (
        (helper.get_spam_data_kmeans_ica_best, 'Neural Network ICA K-Means'),
        (helper.get_spam_data_gmm_ica_best, 'Neural Network ICA GMM'),
    ):
        X_train, X_test, y_train, y_test = loader()
        self.part4.nn_analysis(X_train, X_test, y_train, y_test, 'Spam', title)
def nba_cluster_plots(self):
    """2-D and 3-D cluster plots for the raw NBA shot data."""
    X_train, X_test, y_train, y_test = data_helper().get_nba_data()
    frame = pd.DataFrame(X_train)
    frame.columns = ['Shot Distance', 'Closest Defender Distance', 'Number Dribbles']
    # Same (k, algorithm) pairs for the flat and the 3-D plots.
    pairs = ((5, 'KMeans'), (10, 'GaussianMixture'))
    for k, algorithm in pairs:
        self.cluster_plot(frame, k, algorithm, 'NBA')
    for k, algorithm in pairs:
        self.cluster_3d_plot(frame, k, algorithm, 'NBA')
def wine_cluster_plots(self):
    """2-D and 3-D cluster plots for the raw wine data."""
    X_train, X_test, y_train, y_test = data_helper().get_wine_data()
    frame = pd.DataFrame(X_train)
    frame.columns = ['Alcohol', 'Volatile Acidity', 'Sulphates', 'pH']
    # Same (k, algorithm) trios for the flat and the 3-D plots.
    trios = ((5, 'KMeans'), (5, 'GaussianMixture'), (4, 'GaussianMixture'))
    for k, algorithm in trios:
        self.cluster_plot(frame, k, algorithm, 'Wine')
    for k, algorithm in trios:
        self.cluster_3d_plot(frame, k, algorithm, 'Wine')
def predict_on_file_demo(CHECKPOINT_PATH, file_list):
    """Restore an ADEM model from CHECKPOINT_PATH and score every file in
    file_list, writing the predicted scores back to disk.

    The network hyper-parameters are rebuilt from the configuration encoded
    in the checkpoint filename.
    """
    config = resolve_filename(CHECKPOINT_PATH)  # decode training config from the checkpoint name
    dh = data_helper(config)
    config_network = {
        'HIDDEN_SIZE': 256,
        'NUM_LAYERS': 2,
        'SRC_VOCAB_SIZE': dh.vocab_size,
        'KEEP_PROB': 1,  # inference: dropout disabled
        'MAX_GRAD_NORM': 5,
        'word_embedding_file': 'word_dic_jieba_embedding.pk' if config['seg'] == 'jieba' else 'word_dic_nioseg_embedding.pk',
        'max_len': dh.max_len
    }
    with tf.Session() as sess:
        train_model = ADEM_model(config, config_network)
        tf.global_variables_initializer().run()
        saver = tf.train.Saver()
        saver.restore(sess, CHECKPOINT_PATH)
        # Files for the 'nio' segmenter are stored under the 'nioseg' suffix.
        word_dic = config['seg'] if not config['seg'] == 'nio' else 'nioseg'
        for file in file_list:
            context_input, refrence_input, model_input, context_sequence_length, \
            refrence_sequence_length, model_sequence_length = dh.get_specific_data(file + '_idx_' + word_dic)
            predict_score = train_model.predict_on_batch(
                sess,
                feed_dict={
                    train_model.context_input: context_input,
                    train_model.context_sequence_length: context_sequence_length,
                    train_model.model_response_input: model_input,
                    train_model.refrence_response_input: refrence_input,
                    train_model.model_sequence_length: model_sequence_length,
                    train_model.refrence_sequence_length: refrence_sequence_length,
                })
            # NOTE(review): RMSE is computed against an all-zero baseline —
            # presumably a magnitude report rather than a real error metric;
            # confirm intent.
            std_score = np.zeros(len(predict_score))
            print(RMSE(predict_score, std_score))
            predict_score = np.reshape(predict_score, [len(predict_score)])
            write_in_file(file, predict_score, word_dic)
def nba_cluster_plots(self):
    """Cluster plots (2-D, and 3-D where enough columns exist) for the NBA
    data under each dimensionality reduction."""
    helper = data_helper()
    # (loader, kmeans k, kmeans title, gmm k, gmm title) per reduction.
    variants = (
        (helper.get_nba_data_pca_best, 3, 'K-Means PCA', 8, 'GMM PCA'),
        (helper.get_nba_data_ica_best, 3, 'K-Means ICA', 5, 'GMM ICA'),
        (helper.get_nba_data_lda_best, 4, 'K-Means LDA', 3, 'GMM LDA'),
        (helper.get_nba_data_rp_best, 4, 'K-Means RP', 5, 'GMM RP'),
    )
    for loader, km_k, km_title, gmm_k, gmm_title in variants:
        X_train, X_test, y_train, y_test = loader()
        frame = pd.DataFrame(X_train)
        self.part1.cluster_plot(frame, km_k, 'KMeans', 'NBA', km_title)
        self.part1.cluster_plot(frame, gmm_k, 'GaussianMixture', 'NBA', gmm_title)
        if frame.shape[1] >= 3:
            self.part1.cluster_3d_plot(frame, km_k, 'KMeans', 'NBA', km_title)
            self.part1.cluster_3d_plot(frame, gmm_k, 'GaussianMixture', 'NBA', gmm_title)
def nba_cluster_plots(self):
    """Cluster plots (2-D, and 3-D where enough columns exist) for the NBA
    data under each dimensionality reduction."""
    helper = data_helper()
    # (loader, kmeans k, kmeans title, gmm k, gmm title) per reduction.
    variants = (
        (helper.get_nba_data_pca_best, 3, 'K-Means PCA', 8, 'GMM PCA'),
        (helper.get_nba_data_ica_best, 3, 'K-Means ICA', 5, 'GMM ICA'),
        (helper.get_nba_data_lda_best, 4, 'K-Means LDA', 3, 'GMM LDA'),
        (helper.get_nba_data_rp_best, 4, 'K-Means RP', 5, 'GMM RP'),
    )
    for loader, km_k, km_title, gmm_k, gmm_title in variants:
        X_train, X_test, y_train, y_test = loader()
        frame = pd.DataFrame(X_train)
        self.part1.cluster_plot(frame, km_k, 'KMeans', 'NBA', km_title)
        self.part1.cluster_plot(frame, gmm_k, 'GaussianMixture', 'NBA', gmm_title)
        if frame.shape[1] >= 3:
            self.part1.cluster_3d_plot(frame, km_k, 'KMeans', 'NBA', km_title)
            self.part1.cluster_3d_plot(frame, gmm_k, 'GaussianMixture', 'NBA', gmm_title)
def best_rp_pima(self):
    """Gaussian random projection of the Pima data, keeping the two
    components with the highest training-set kurtosis; saves the splits.
    """
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_pima_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    # GaussianRandomProjection is unsupervised; y is not needed for the fit.
    rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
    X_train_transformed = rp.fit_transform(X_train_scl)
    X_test_transformed = rp.transform(X_test_scl)

    ## top 2
    # BUG FIX: derive the component ordering from the training data only and
    # apply the same ordering to the test data; the original re-sorted the
    # test columns by their own kurtosis, which could select different,
    # misaligned components for train and test.
    kurt = kurtosis(X_train_transformed)
    i = kurt.argsort()[::-1]
    X_train_transformed = X_train_transformed[:, i][:, 0:2]
    X_test_transformed = X_test_transformed[:, i][:, 0:2]

    # save
    filename = './' + self.save_dir + '/pima_rp_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/pima_rp_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/pima_rp_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/pima_rp_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def best_ica_spam(self):
    """FastICA on the spam data, keeping the two components with the highest
    training-set kurtosis; saves the train/test splits to disk.
    """
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_spam_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    # FastICA is unsupervised; y is not needed for the fit.
    ica = FastICA(n_components=X_train_scl.shape[1])
    X_train_transformed = ica.fit_transform(X_train_scl)
    X_test_transformed = ica.transform(X_test_scl)

    ## top 2
    # BUG FIX: derive the component ordering from the training data only and
    # apply the same ordering to the test data; the original re-sorted the
    # test columns by their own kurtosis, which could select different,
    # misaligned components for train and test.
    kurt = kurtosis(X_train_transformed)
    i = kurt.argsort()[::-1]
    X_train_transformed = X_train_transformed[:, i][:, 0:2]
    X_test_transformed = X_test_transformed[:, i][:, 0:2]

    # save
    filename = './' + self.save_dir + '/spam_ica_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/spam_ica_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/spam_ica_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/spam_ica_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def best_lda_nba(self):
    """Reduce the NBA data to 2 LDA components and persist the splits."""
    helper = data_helper()
    X_train, X_test, y_train, y_test = helper.get_nba_data()

    scaler = RobustScaler()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    # LDA is supervised, so the training labels drive the projection.
    lda = LinearDiscriminantAnalysis(n_components=2)
    projected_train = lda.fit_transform(train_scaled, y_train)
    projected_test = lda.transform(test_scaled)

    # One file per split, named './<save_dir>/nba_lda_<suffix>.txt'.
    for suffix, data in (('x_train', projected_train), ('x_test', projected_test),
                         ('y_train', y_train), ('y_test', y_test)):
        target = './' + self.save_dir + '/nba_lda_' + suffix + '.txt'
        pd.DataFrame(data).to_csv(target, header=False, index=False)
def best_pca_wine(self):
    """Project the wine data onto its top 3 principal components and save
    the transformed train/test splits plus labels to disk.
    """
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    # PCA is unsupervised — the original passed y_train to fit_transform,
    # where it is silently ignored and only misleads the reader.
    pca = PCA(n_components=3)
    X_train_transformed = pca.fit_transform(X_train_scl)
    X_test_transformed = pca.transform(X_test_scl)

    # save
    filename = './' + self.save_dir + '/wine_pca_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_pca_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_pca_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_pca_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def gmm_lda_spam(self):
    """GMM analysis (up to 20 components) of the LDA-reduced spam data."""
    splits = data_helper().get_spam_data_lda_best()
    self.part1.gmm_analysis(*splits, 'Spam', 20, 'GMM LDA')
def kmeans_nba(self):
    """K-Means analysis (up to 20 clusters) of the raw NBA data."""
    splits = data_helper().get_nba_data()
    self.kmeans_analysis(*splits, 'NBA', 20)
# --- Command-line flags for the VDCNN training script ---
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 128)")
tf.flags.DEFINE_integer("num_epochs", 50, "Number of training epochs (default: 50)")
tf.flags.DEFINE_integer("evaluate_every", 50, "Evaluate model on dev set after this many steps (default: 50)")
tf.flags.DEFINE_boolean("enable_tensorboard", True, "Enable Tensorboard (default: True)")

FLAGS = tf.flags.FLAGS
FLAGS.flag_values_dict()  # forces flag parsing so the dump below sees values
print("Parameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr, value))
print("")

# Data Preparation
# Load data
print("Loading data...")
# NOTE(review): this rebinds the imported name `data_helper` to an instance,
# shadowing the class — confirm the class is not needed again below.
data_helper = data_helper(sequence_max_length=FLAGS.sequence_max_length)
train_data, train_label, test_data, test_label = data_helper.load_dataset(FLAGS.database_path)  # load the data
num_batches_per_epoch = int((len(train_data) - 1) / FLAGS.batch_size) + 1
print("Loading data succees...")

# ConvNet
acc_list = [0]  # accuracy history, seeded with 0
sess = tf.Session()
cnn = VDCNN(num_classes=train_label.shape[1],
            sequence_max_length=FLAGS.sequence_max_length,
            use_he_uniform=FLAGS.use_he_uniform)

# Optimizer and LR Decay
# Tie the optimizer to the UPDATE_OPS collection (e.g. batch-norm statistics).
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    global_step = tf.Variable(0, name="global_step", trainable=False)
def kmeans_lda_wine(self):
    """K-Means analysis (up to 20 clusters) of the LDA-reduced wine data."""
    splits = data_helper().get_wine_data_lda_best()
    self.part1.kmeans_analysis(*splits, 'Wine', 20, 'K-Means LDA')
def gmm_lda_wine(self):
    """GMM analysis (up to 20 components) of the LDA-reduced wine data."""
    splits = data_helper().get_wine_data_lda_best()
    self.part1.gmm_analysis(*splits, 'Wine', 20, 'GMM LDA')
def gmm_wine(self):
    """GMM analysis (up to 30 components) of the raw wine data."""
    splits = data_helper().get_wine_data()
    self.gmm_analysis(*splits, 'Wine', 30)
def lda_spam(self):
    """LDA analysis of the raw spam data."""
    splits = data_helper().get_spam_data()
    self.lda_analysis(*splits, 'Spam')
def kmeans_lda_spam(self):
    """K-Means analysis (up to 20 clusters) of the LDA-reduced spam data."""
    splits = data_helper().get_spam_data_lda_best()
    self.part1.kmeans_analysis(*splits, 'Spam', 20, 'K-Means LDA')
def gmm_lda_nba(self):
    """GMM analysis (up to 20 components) of the LDA-reduced NBA data."""
    splits = data_helper().get_nba_data_lda_best()
    self.part1.gmm_analysis(*splits, 'NBA', 20, 'GMM LDA')
def kmeans_lda_nba(self):
    """K-Means analysis (up to 20 clusters) of the LDA-reduced NBA data."""
    splits = data_helper().get_nba_data_lda_best()
    self.part1.kmeans_analysis(*splits, 'NBA', 20, 'K-Means LDA')
def lda_pima(self):
    """LDA analysis of the raw Pima data."""
    splits = data_helper().get_pima_data()
    self.lda_analysis(*splits, 'Pima')
def lda_wine(self):
    """LDA analysis of the raw wine data."""
    splits = data_helper().get_wine_data()
    self.lda_analysis(*splits, 'Wine')
def nn_lda_wine(self):
    """Neural-network benchmark on the LDA-reduced wine data."""
    splits = data_helper().get_wine_data_lda_best()
    self.nn_analysis(*splits, "Wine", "Neural Network LDA")
def kmeans_spam(self):
    """K-Means analysis (up to 20 clusters) of the raw spam data."""
    splits = data_helper().get_spam_data()
    self.kmeans_analysis(*splits, 'Spam', 20)
def kmeans_rp_nba(self):
    """K-Means analysis (up to 20 clusters) of the RP-reduced NBA data."""
    splits = data_helper().get_nba_data_rp_best()
    self.part1.kmeans_analysis(*splits, 'NBA', 20, 'K-Means RP')
def kmeans_pima(self):
    """K-Means analysis (up to 20 clusters) of the raw Pima data."""
    splits = data_helper().get_pima_data()
    self.kmeans_analysis(*splits, 'Pima', 20)
def gmm_rp_nba(self):
    """GMM analysis (up to 20 components) of the RP-reduced NBA data."""
    splits = data_helper().get_nba_data_rp_best()
    self.part1.gmm_analysis(*splits, 'NBA', 20, 'GMM RP')
if __name__ == '__main__':
    # if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test','predict']:
    #     raise ValueError("""usage: python run_abblstm.py [train / test / predict]""")
    print('Configuring VDCNN model...')
    is_char = 0  # 0 = word-level inputs (not character-level)
    config = vdcnnConfig()
    # if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
    #     build_vocab(train_dir, vocab_dir, config.vocab_size, is_char)
    #     build_vocab(train_dir, vocab_dir, config.vocab_size, is_char)

    # Data Preparation
    # Load data
    print("Loading data...")
    # NOTE(review): rebinds the imported name `data_helper` to an instance,
    # shadowing the class — confirm the class is not needed again below.
    data_helper = data_helper(sequence_max_length=config.seq_length, train_file_path=train_dir)
    x_train, y_train, test_data, test_label, cat_to_id, id_to_cat, categories = data_helper.load_dataset(
        base_dir, 0)
    print("train size: ", len(x_train))
    # Hold out 10% of the test pool as a validation split.
    x_test, x_val, y_test, y_val = data_helper.split_dataset(
        test_data, test_label, 0.1)
    print("test size: ", len(x_test))
    print("Validation size: ", len(x_val))
    num_batches_per_epoch = int((len(x_train) - 1) / config.batch_size) + 1
    print("num_batches_per_epoch size: ", num_batches_per_epoch)
    x_predict = data_helper.load_dataset_predict(base_dir, 0)
    print("predict size: ", len(x_predict))
    print("Loading data succees...")
    # Propagate the dataset-derived sizes into the model configuration.
    config.num_classes = len(categories)
    config.num_batches_per_epoch = num_batches_per_epoch
# --- Command-line flags for the VDCNN training script ---
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 128)")
tf.flags.DEFINE_integer("num_epochs", 100, "Number of training epochs (default: 50)")
tf.flags.DEFINE_integer(
    "evaluate_every", 50, "Evaluate model on dev set after this many steps (default: 50)")
tf.flags.DEFINE_boolean("enable_tensorboard", True, "Enable Tensorboard (default: True)")
tf.flags.DEFINE_integer("save_every", 2000, "save model after this many steps (default: 2000)")
FLAGS = tf.flags.FLAGS

# Data Preparation
# Load data
print("Loading data...")
# NOTE(review): rebinds the imported name `data_helper` to an instance,
# shadowing the class — confirm the class is not needed again below.
data_helper = data_helper(sequence_max_length=FLAGS.sequence_max_length, use_title=FLAGS.use_title)
train_data, train_label, train_texts, test_data, test_label, test_texts = data_helper.load_dataset(
    FLAGS.database_path)
num_batches_per_epoch = int((len(train_data) - 1) / FLAGS.batch_size) + 1
print("Loading data succees...")

# ConvNet
acc_list = [0]  # accuracy history, seeded with 0
sess = tf.Session()
cnn = VDCNN(num_classes=train_label.shape[1],
            depth=FLAGS.depth,
            sequence_max_length=FLAGS.sequence_max_length,
            downsampling_type=FLAGS.downsampling_type,
            use_he_uniform=FLAGS.use_he_uniform,
            optional_shortcut=FLAGS.optional_shortcut)
def main():
    """Train the ADEM model.

    Builds the data_helper / model configuration (the model file is named
    from this configuration), restores an existing checkpoint of the same
    name when present, then alternates batch training with periodic
    validation, saving whenever the validation loss improves.

    Configuration keys:
        score_style: 'mine' concatenates the three inputs into one vector and
            scores it with a two-layer fully connected network; 'adem'
            follows the ADEM paper, scoring via matrix-transform similarities
            between the three inputs.
        normal: whether to add a regularization term.
        LR: the learning rate.
        cate: 'mlut' = regression over scores 0-4; 'two' = binary classification.
        weight: whether to weight samples inversely to class frequency, to
            counter the imbalanced class distribution.
        data: which data batch to use ('8', '9', 'origin', 'all'); default 'all'.
        seg: segmentation type — 'nio' (in-house segmenter), 'jieba', or
            'ipx' (single characters).
        prewordembedding: whether to use pre-trained word embeddings.
        attflag: whether to use the attention mechanism.
    """
    config = {'score_style': 'mine', 'normal': True, 'LR': 1, 'cate': 'mlut', 'weight': True,
              'data': 'all', 'seg': 'jieba', 'prewordembedding': False, 'attflag': True}
    dh = data_helper(config=config)
    # Network hyper-parameters; kept in a second dict so the model file can
    # be named from the first configuration alone.
    config_network = {
        'HIDDEN_SIZE': 128,
        'NUM_LAYERS': 1,
        'SRC_VOCAB_SIZE': dh.vocab_size,
        'KEEP_PROB': 0.8,
        'MAX_GRAD_NORM': 5,
        'word_embedding_file': 'word_dic_jieba_embedding.pk' if config[
            'seg'] == 'jieba' else 'word_dic_nioseg_embedding.pk',
        'max_len': dh.max_len
    }
    model_name = tostring(config)
    CHECKPOINT_PATH = '../MODEL/' + model_name + '_ckpt'
    # Check whether a model with the same name already exists on disk.
    if os.path.exists(CHECKPOINT_PATH + '.index'):
        exists_flag = True
    else:
        exists_flag = False
    train_model = ADEM_model(config, config_network)
    saver = tf.train.Saver()
    step = 0
    max_loop = 9999999
    min_ = 9999  # best (lowest) validation loss seen so far
    marks = []  # (iteration, validation loss) history
    # Limit TensorFlow's GPU memory usage.
    config_tf = tf.ConfigProto()
    config_tf.gpu_options.per_process_gpu_memory_fraction = 0.9
    config_tf.gpu_options.allow_growth = True
    with tf.Session(config=config_tf) as sess:
        tf.global_variables_initializer().run()
        # If a same-named model exists, load it and continue training.
        if exists_flag:
            print('loading trained model')
            saver = tf.train.Saver()
            saver.restore(sess, CHECKPOINT_PATH)
        for i in range(max_loop):
            context_input_, refrence_input_, model_input_, context_sequence_length_, \
            refrence_sequence_length_, model_sequence_length_, human_score_, grad_ys_ = dh.next_batch(300)
            step = train_model.train_on_batch(sess, step, feed_dict={
                train_model.context_input: context_input_,
                train_model.context_sequence_length: context_sequence_length_,
                train_model.model_response_input: model_input_,
                train_model.refrence_response_input: refrence_input_,
                train_model.model_sequence_length: model_sequence_length_,
                train_model.refrence_sequence_length: refrence_sequence_length_,
                train_model.human_score: human_score_,
                train_model.grad_ys: grad_ys_
            })
            # Every 100 iterations: validate, and checkpoint when the
            # mean-squared error improves on the best seen so far.
            if i % 100 == 0 and i != 0:
                context_input_, refrence_input_, model_input_, context_sequence_length_, \
                refrence_sequence_length_, model_sequence_length_, human_score_, grad_ys_ = dh.get_val_data()
                model_score = train_model.predict_on_batch(sess, feed_dict={
                    train_model.context_input: context_input_,
                    train_model.context_sequence_length: context_sequence_length_,
                    train_model.model_response_input: model_input_,
                    train_model.refrence_response_input: refrence_input_,
                    train_model.model_sequence_length: model_sequence_length_,
                    train_model.refrence_sequence_length: refrence_sequence_length_,
                })
                loss = train_model.mean_square_error(human_score_, model_score)
                if loss < min_:
                    min_ = loss
                    saver.save(sess, CHECKPOINT_PATH)
                marks.append([i, loss])
        # Dump the validation-loss history and the configuration used.
        for k in marks:
            print(k)
        print(config)
def kmeans_wine(self):
    """K-Means analysis (up to 20 clusters) of the raw wine data."""
    splits = data_helper().get_wine_data()
    self.kmeans_analysis(*splits, 'Wine', 20)
def lda_nba(self):
    """LDA analysis of the raw NBA data."""
    splits = data_helper().get_nba_data()
    self.lda_analysis(*splits, 'NBA')
def kmeans_lda_pima(self):
    """K-Means analysis (up to 20 clusters) of the LDA-reduced Pima data."""
    splits = data_helper().get_pima_data_lda_best()
    self.part1.kmeans_analysis(*splits, 'Pima', 20, 'K-Means LDA')
def gmm_pima(self):
    """GMM analysis (up to 30 components) of the raw Pima data."""
    splits = data_helper().get_pima_data()
    self.gmm_analysis(*splits, 'Pima', 30)
def gmm_lda_pima(self):
    """GMM analysis (up to 20 components) of the LDA-reduced Pima data."""
    splits = data_helper().get_pima_data_lda_best()
    self.part1.gmm_analysis(*splits, 'Pima', 20, 'GMM LDA')
def gmm_nba(self):
    """GMM analysis (up to 30 components) of the raw NBA data."""
    splits = data_helper().get_nba_data()
    self.gmm_analysis(*splits, 'NBA', 30)