def best_lda_cluster_spam(self):
    """Cluster the LDA-reduced spam data with K-Means and GMM and save the
    transformed train/test splits (plus labels) as headerless CSV files
    under self.save_dir.
    """
    dh = data_helper()  # original constructed this twice; once is enough
    X_train, X_test, y_train, y_test = dh.get_spam_data_lda_best()

    # Median/IQR scaling so outliers don't dominate the distance metric.
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    def save_split(prefix, X_tr, X_te):
        # Persist one clustering's outputs plus the labels.
        base = './' + self.save_dir + '/' + prefix
        pd.DataFrame(X_tr).to_csv(base + '_x_train.txt', header=False, index=False)
        pd.DataFrame(X_te).to_csv(base + '_x_test.txt', header=False, index=False)
        pd.DataFrame(y_train).to_csv(base + '_y_train.txt', header=False, index=False)
        pd.DataFrame(y_test).to_csv(base + '_y_test.txt', header=False, index=False)

    ##
    ## K-Means: transform() yields distances to each of the 4 cluster centers.
    ##
    km = KMeans(n_clusters=4, algorithm='full')
    X_train_transformed = km.fit_transform(X_train_scl)
    X_test_transformed = km.transform(X_test_scl)
    save_split('spam_kmeans_lda', X_train_transformed, X_test_transformed)

    ##
    ## GMM. BUG FIX: the original built `gmm` but then refit and reused `km`,
    ## so the "gmm" files contained K-Means output. GaussianMixture has no
    ## transform(); use predict_proba() for soft cluster memberships.
    ##
    gmm = GaussianMixture(n_components=4, covariance_type='full')
    gmm.fit(X_train_scl)
    X_train_transformed = gmm.predict_proba(X_train_scl)
    X_test_transformed = gmm.predict_proba(X_test_scl)
    save_split('spam_gmm_lda', X_train_transformed, X_test_transformed)
Example #2
0
 def best_lda_cluster_wine(self):
     """Cluster the LDA-reduced wine data with K-Means and GMM and save the
     transformed train/test splits (plus labels) as headerless CSV files
     under self.save_dir.
     """
     dh = data_helper()  # original constructed this twice; once is enough
     X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()

     # Median/IQR scaling so outliers don't dominate the distance metric.
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)

     def save_split(prefix, X_tr, X_te):
         # Persist one clustering's outputs plus the labels.
         base = './' + self.save_dir + '/' + prefix
         pd.DataFrame(X_tr).to_csv(base + '_x_train.txt', header=False, index=False)
         pd.DataFrame(X_te).to_csv(base + '_x_test.txt', header=False, index=False)
         pd.DataFrame(y_train).to_csv(base + '_y_train.txt', header=False, index=False)
         pd.DataFrame(y_test).to_csv(base + '_y_test.txt', header=False, index=False)

     ##
     ## K-Means: transform() yields distances to each of the 4 cluster centers.
     ##
     km = KMeans(n_clusters=4, algorithm='full')
     X_train_transformed = km.fit_transform(X_train_scl)
     X_test_transformed = km.transform(X_test_scl)
     save_split('wine_kmeans_lda', X_train_transformed, X_test_transformed)

     ##
     ## GMM. BUG FIX: the original built `gmm` but then refit and reused `km`,
     ## so the "gmm" files contained K-Means output. GaussianMixture has no
     ## transform(); use predict_proba() for soft cluster memberships.
     ##
     gmm = GaussianMixture(n_components=4, covariance_type='full')
     gmm.fit(X_train_scl)
     X_train_transformed = gmm.predict_proba(X_train_scl)
     X_test_transformed = gmm.predict_proba(X_test_scl)
     save_split('wine_gmm_lda', X_train_transformed, X_test_transformed)
Example #3
0
def predict_on_line_demo(CHECKPOINT_PATH,
                         line=u'我最近好累啊\t要好好休息,知道吗\t知道你过得不好我就放心了'):
    """Restore an ADEM model from CHECKPOINT_PATH and print its score for a
    single tab-separated (context, reference, response) line.
    """
    config = resolve_filename(CHECKPOINT_PATH)
    dh = data_helper(config)

    # The embedding file depends on the segmenter recorded in the checkpoint.
    embedding_file = ('word_dic_jieba_embedding.pk'
                      if config['seg'] == 'jieba'
                      else 'word_dic_nioseg_embedding.pk')
    config_network = {
        'HIDDEN_SIZE': 256,
        'NUM_LAYERS': 2,
        'SRC_VOCAB_SIZE': dh.vocab_size,
        'KEEP_PROB': 1,  # inference: no dropout
        'MAX_GRAD_NORM': 5,
        'word_embedding_file': embedding_file,
        'max_len': dh.max_len,
    }

    with tf.Session() as sess:
        train_model = ADEM_model(config, config_network)
        tf.global_variables_initializer().run()

        saver = tf.train.Saver()
        saver.restore(sess, CHECKPOINT_PATH)

        train_model.set_word_dic('word_dic_jieba')
        print(train_model.predict_on_line(sess=sess, line=line))
Example #4
0
 def best_rp_nba(self):
     """Project the NBA data with a Gaussian random projection, keep the two
     highest-kurtosis components, and save the splits as headerless CSVs.
     """
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_nba_data()

     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)

     # Random projection is unsupervised: fit ignores y, so don't pass it.
     rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
     X_train_transformed = rp.fit_transform(X_train_scl)
     X_test_transformed = rp.transform(X_test_scl)

     ## Top 2 components by TRAINING kurtosis. BUG FIX: the original re-ranked
     ## the test columns by the test set's own kurtosis, which can pick
     ## different components and mis-align train/test features; reuse the
     ## training order for both splits.
     i = kurtosis(X_train_transformed).argsort()[::-1]
     X_train_transformed = X_train_transformed[:, i][:, 0:2]
     X_test_transformed = X_test_transformed[:, i][:, 0:2]

     # save
     filename = './' + self.save_dir + '/nba_rp_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

     filename = './' + self.save_dir + '/nba_rp_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

     filename = './' + self.save_dir + '/nba_rp_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

     filename = './' + self.save_dir + '/nba_rp_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
    def best_lda_pima(self):
        """Project the Pima data onto 2 LDA components and save the splits as
        headerless CSV files under self.save_dir.
        """
        helper = data_helper()
        X_train, X_test, y_train, y_test = helper.get_pima_data()

        scaler = RobustScaler()
        train_scaled = scaler.fit_transform(X_train)
        test_scaled = scaler.transform(X_test)

        # LDA is supervised, so the training labels take part in the fit.
        lda = LinearDiscriminantAnalysis(n_components=2)
        train_proj = lda.fit_transform(train_scaled, y_train)
        test_proj = lda.transform(test_scaled)

        # Write each split as a headerless CSV.
        outputs = (('pima_lda_x_train.txt', train_proj),
                   ('pima_lda_x_test.txt', test_proj),
                   ('pima_lda_y_train.txt', y_train),
                   ('pima_lda_y_test.txt', y_test))
        for name, data in outputs:
            path = './' + self.save_dir + '/' + name
            pd.DataFrame(data).to_csv(path, header=False, index=False)
Example #6
0
 def nn_rp_cluster_wine(self):
     """Neural-network analysis of wine data reduced by RP then clustered
     with K-Means and with GMM.
     """
     helper = data_helper()
     runs = ((helper.get_wine_data_kmeans_rp_best, 'Neural Network RP K-Means'),
             (helper.get_wine_data_gmm_rp_best, 'Neural Network RP GMM'))
     for loader, title in runs:
         X_train, X_test, y_train, y_test = loader()
         self.part4.nn_analysis(X_train, X_test, y_train, y_test, 'Wine', title)
Example #7
0
 def best_ica_wine(self):
     """Run FastICA on the wine data, keep the two highest-kurtosis
     components, and save the splits as headerless CSVs.
     """
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data()

     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)

     # ICA is unsupervised: fit ignores y, so don't pass it.
     ica = FastICA(n_components=X_train_scl.shape[1])
     X_train_transformed = ica.fit_transform(X_train_scl)
     X_test_transformed = ica.transform(X_test_scl)

     ## Top 2 components by TRAINING kurtosis. BUG FIX: the original re-ranked
     ## the test columns by the test set's own kurtosis, which can pick
     ## different components and mis-align train/test features; reuse the
     ## training order for both splits.
     i = kurtosis(X_train_transformed).argsort()[::-1]
     X_train_transformed = X_train_transformed[:, i][:, 0:2]
     X_test_transformed = X_test_transformed[:, i][:, 0:2]

     # save
     filename = './' + self.save_dir + '/wine_ica_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

     filename = './' + self.save_dir + '/wine_ica_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

     filename = './' + self.save_dir + '/wine_ica_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

     filename = './' + self.save_dir + '/wine_ica_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
    def best_pca_spam(self):
        """Reduce the spam data to 3 principal components and save the splits
        as headerless CSV files under self.save_dir.
        """
        dh = data_helper()
        X_train, X_test, y_train, y_test = dh.get_spam_data()

        scl = RobustScaler()
        X_train_scl = scl.fit_transform(X_train)
        X_test_scl = scl.transform(X_test)

        # PCA is unsupervised — its fit ignores y, so don't pass y_train
        # (the original did, which was harmless but misleading).
        pca = PCA(n_components=3)
        X_train_transformed = pca.fit_transform(X_train_scl)
        X_test_transformed = pca.transform(X_test_scl)

        # save
        filename = './' + self.save_dir + '/spam_pca_x_train.txt'
        pd.DataFrame(X_train_transformed).to_csv(filename,
                                                 header=False,
                                                 index=False)

        filename = './' + self.save_dir + '/spam_pca_x_test.txt'
        pd.DataFrame(X_test_transformed).to_csv(filename,
                                                header=False,
                                                index=False)

        filename = './' + self.save_dir + '/spam_pca_y_train.txt'
        pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

        filename = './' + self.save_dir + '/spam_pca_y_test.txt'
        pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #9
0
 def nn_wine_orig(self):
     """Neural-network analysis of the original (robust-scaled) wine data."""
     helper = data_helper()
     X_train, X_test, y_train, y_test = helper.get_wine_data()

     scaler = RobustScaler()
     train_scaled = scaler.fit_transform(X_train)
     test_scaled = scaler.transform(X_test)

     self.part4.nn_analysis(train_scaled, test_scaled, y_train, y_test, 'Wine', 'Neural Network Original')
Example #10
0
    def nn_ica_cluster_spam(self):
        """Neural-network analysis of spam data reduced by ICA then clustered
        with K-Means and with GMM.
        """
        helper = data_helper()
        runs = ((helper.get_spam_data_kmeans_ica_best,
                 'Neural Network ICA K-Means'),
                (helper.get_spam_data_gmm_ica_best,
                 'Neural Network ICA GMM'))
        for loader, title in runs:
            X_train, X_test, y_train, y_test = loader()
            self.part4.nn_analysis(X_train, X_test, y_train, y_test, 'Spam',
                                   title)
Example #11
0
 def nba_cluster_plots(self):
     """2-D and 3-D cluster plots of the raw NBA shot features."""
     helper = data_helper()
     X_train, X_test, y_train, y_test = helper.get_nba_data()

     df = pd.DataFrame(X_train)
     df.columns = ['Shot Distance', 'Closest Defender Distance', 'Number Dribbles']

     # K-Means with 5 clusters, GMM with 10 components — 2-D then 3-D.
     configs = ((5, 'KMeans'), (10, 'GaussianMixture'))
     for n, algo in configs:
         self.cluster_plot(df, n, algo, 'NBA')
     for n, algo in configs:
         self.cluster_3d_plot(df, n, algo, 'NBA')
Example #12
0
 def wine_cluster_plots(self):
     """2-D and 3-D cluster plots of selected wine features."""
     helper = data_helper()
     X_train, X_test, y_train, y_test = helper.get_wine_data()

     df = pd.DataFrame(X_train)
     df.columns = ['Alcohol', 'Volatile Acidity', 'Sulphates', 'pH']

     # Same (k, algorithm) combinations as before — 2-D plots then 3-D plots.
     configs = ((5, 'KMeans'), (5, 'GaussianMixture'), (4, 'GaussianMixture'))
     for n, algo in configs:
         self.cluster_plot(df, n, algo, 'Wine')
     for n, algo in configs:
         self.cluster_3d_plot(df, n, algo, 'Wine')
Example #13
0
def predict_on_file_demo(CHECKPOINT_PATH, file_list):
    """Restore an ADEM model from CHECKPOINT_PATH and score every data file in
    file_list, printing the RMSE of each batch against a zero baseline and
    writing the predicted scores out via write_in_file.
    """
    config = resolve_filename(CHECKPOINT_PATH)
    dh = data_helper(config)
    # Network hyper-parameters; the embedding file depends on the segmenter
    # recorded in the checkpoint's config.
    config_network = {
        'HIDDEN_SIZE':
        256,
        'NUM_LAYERS':
        2,
        'SRC_VOCAB_SIZE':
        dh.vocab_size,
        'KEEP_PROB':
        1,
        'MAX_GRAD_NORM':
        5,
        'word_embedding_file':
        'word_dic_jieba_embedding.pk'
        if config['seg'] == 'jieba' else 'word_dic_nioseg_embedding.pk',
        'max_len':
        dh.max_len
    }
    with tf.Session() as sess:
        train_model = ADEM_model(config, config_network)
        tf.global_variables_initializer().run()

        saver = tf.train.Saver()
        saver.restore(sess, CHECKPOINT_PATH)

        # Map the config's segmenter name to the suffix used in the data files.
        word_dic = config['seg'] if not config['seg'] == 'nio' else 'nioseg'

        for file in file_list:
            context_input, refrence_input, model_input, context_sequence_length, \
            refrence_sequence_length, model_sequence_length = dh.get_specific_data(file + '_idx_' + word_dic)
            predict_score = train_model.predict_on_batch(
                sess,
                feed_dict={
                    train_model.context_input: context_input,
                    train_model.context_sequence_length:
                    context_sequence_length,
                    train_model.model_response_input: model_input,
                    train_model.refrence_response_input: refrence_input,
                    train_model.model_sequence_length: model_sequence_length,
                    train_model.refrence_sequence_length:
                    refrence_sequence_length,
                })
            # RMSE against an all-zero "reference" vector — effectively the
            # root-mean-square of the predictions themselves.
            std_score = np.zeros(len(predict_score))
            print(RMSE(predict_score, std_score))
            # Flatten to 1-D before writing the scores back out.
            predict_score = np.reshape(predict_score, [len(predict_score)])
            write_in_file(file, predict_score, word_dic)
Example #14
0
    def nba_cluster_plots(self):
        """Cluster plots (2-D, and 3-D when enough components exist) of the
        NBA data under each dimensionality-reduction technique.
        """
        dh = data_helper()

        # (data loader, K-Means k, GMM k, technique tag) per reduction.
        configs = (
            (dh.get_nba_data_pca_best, 3, 8, 'PCA'),
            (dh.get_nba_data_ica_best, 3, 5, 'ICA'),
            (dh.get_nba_data_lda_best, 4, 3, 'LDA'),
            (dh.get_nba_data_rp_best, 4, 5, 'RP'),
        )
        for loader, km_k, gmm_k, tag in configs:
            X_train, X_test, y_train, y_test = loader()
            df = pd.DataFrame(X_train)

            self.part1.cluster_plot(df, km_k, 'KMeans', 'NBA',
                                    'K-Means ' + tag)
            self.part1.cluster_plot(df, gmm_k, 'GaussianMixture', 'NBA',
                                    'GMM ' + tag)
            # 3-D plots only make sense with at least three components.
            if df.shape[1] >= 3:
                self.part1.cluster_3d_plot(df, km_k, 'KMeans', 'NBA',
                                           'K-Means ' + tag)
                self.part1.cluster_3d_plot(df, gmm_k, 'GaussianMixture',
                                           'NBA', 'GMM ' + tag)
Example #15
0
 def nba_cluster_plots(self):
     """Cluster plots (2-D, plus 3-D when >= 3 components) of the NBA data
     under each dimensionality-reduction technique.
     """
     helper = data_helper()

     # Each entry: (loader, K-Means k, GMM k, technique label).
     for loader, km_k, gmm_k, tag in (
             (helper.get_nba_data_pca_best, 3, 8, 'PCA'),
             (helper.get_nba_data_ica_best, 3, 5, 'ICA'),
             (helper.get_nba_data_lda_best, 4, 3, 'LDA'),
             (helper.get_nba_data_rp_best, 4, 5, 'RP')):
         X_train, X_test, y_train, y_test = loader()
         frame = pd.DataFrame(X_train)

         self.part1.cluster_plot(frame, km_k, 'KMeans', 'NBA', 'K-Means ' + tag)
         self.part1.cluster_plot(frame, gmm_k, 'GaussianMixture', 'NBA', 'GMM ' + tag)
         # 3-D plots need at least three components.
         if frame.shape[1] >= 3:
             self.part1.cluster_3d_plot(frame, km_k, 'KMeans', 'NBA', 'K-Means ' + tag)
             self.part1.cluster_3d_plot(frame, gmm_k, 'GaussianMixture', 'NBA', 'GMM ' + tag)
Example #16
0
    def best_rp_pima(self):
        """Project the Pima data with a Gaussian random projection, keep the
        two highest-kurtosis components, and save the splits as headerless
        CSV files under self.save_dir.
        """
        dh = data_helper()
        X_train, X_test, y_train, y_test = dh.get_pima_data()

        scl = RobustScaler()
        X_train_scl = scl.fit_transform(X_train)
        X_test_scl = scl.transform(X_test)

        # Random projection is unsupervised: fit ignores y, so don't pass it.
        rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
        X_train_transformed = rp.fit_transform(X_train_scl)
        X_test_transformed = rp.transform(X_test_scl)

        ## Top 2 components by TRAINING kurtosis. BUG FIX: the original
        ## re-ranked the test columns by the test set's own kurtosis, which
        ## can pick different components and mis-align train/test features;
        ## reuse the training order for both splits.
        i = kurtosis(X_train_transformed).argsort()[::-1]
        X_train_transformed = X_train_transformed[:, i][:, 0:2]
        X_test_transformed = X_test_transformed[:, i][:, 0:2]

        # save
        filename = './' + self.save_dir + '/pima_rp_x_train.txt'
        pd.DataFrame(X_train_transformed).to_csv(filename,
                                                 header=False,
                                                 index=False)

        filename = './' + self.save_dir + '/pima_rp_x_test.txt'
        pd.DataFrame(X_test_transformed).to_csv(filename,
                                                header=False,
                                                index=False)

        filename = './' + self.save_dir + '/pima_rp_y_train.txt'
        pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

        filename = './' + self.save_dir + '/pima_rp_y_test.txt'
        pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #17
0
    def best_ica_spam(self):
        """Run FastICA on the spam data, keep the two highest-kurtosis
        components, and save the splits as headerless CSV files under
        self.save_dir.
        """
        dh = data_helper()
        X_train, X_test, y_train, y_test = dh.get_spam_data()

        scl = RobustScaler()
        X_train_scl = scl.fit_transform(X_train)
        X_test_scl = scl.transform(X_test)

        # ICA is unsupervised: fit ignores y, so don't pass it.
        ica = FastICA(n_components=X_train_scl.shape[1])
        X_train_transformed = ica.fit_transform(X_train_scl)
        X_test_transformed = ica.transform(X_test_scl)

        ## Top 2 components by TRAINING kurtosis. BUG FIX: the original
        ## re-ranked the test columns by the test set's own kurtosis, which
        ## can pick different components and mis-align train/test features;
        ## reuse the training order for both splits.
        i = kurtosis(X_train_transformed).argsort()[::-1]
        X_train_transformed = X_train_transformed[:, i][:, 0:2]
        X_test_transformed = X_test_transformed[:, i][:, 0:2]

        # save
        filename = './' + self.save_dir + '/spam_ica_x_train.txt'
        pd.DataFrame(X_train_transformed).to_csv(filename,
                                                 header=False,
                                                 index=False)

        filename = './' + self.save_dir + '/spam_ica_x_test.txt'
        pd.DataFrame(X_test_transformed).to_csv(filename,
                                                header=False,
                                                index=False)

        filename = './' + self.save_dir + '/spam_ica_y_train.txt'
        pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

        filename = './' + self.save_dir + '/spam_ica_y_test.txt'
        pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #18
0
 def best_lda_nba(self):
     """Project the NBA data onto 2 LDA components and save the splits as
     headerless CSV files under self.save_dir.
     """
     helper = data_helper()
     X_train, X_test, y_train, y_test = helper.get_nba_data()

     scaler = RobustScaler()
     train_scaled = scaler.fit_transform(X_train)
     test_scaled = scaler.transform(X_test)

     # LDA is supervised, so the training labels take part in the fit.
     lda = LinearDiscriminantAnalysis(n_components=2)
     train_proj = lda.fit_transform(train_scaled, y_train)
     test_proj = lda.transform(test_scaled)

     # Write each split as a headerless CSV.
     for name, data in (('nba_lda_x_train.txt', train_proj),
                        ('nba_lda_x_test.txt', test_proj),
                        ('nba_lda_y_train.txt', y_train),
                        ('nba_lda_y_test.txt', y_test)):
         path = './' + self.save_dir + '/' + name
         pd.DataFrame(data).to_csv(path, header=False, index=False)
Example #19
0
 def best_pca_wine(self):
     """Reduce the wine data to 3 principal components and save the splits
     as headerless CSV files under self.save_dir.
     """
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data()

     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)

     # PCA is unsupervised — its fit ignores y, so don't pass y_train
     # (the original did, which was harmless but misleading).
     pca = PCA(n_components=3)
     X_train_transformed = pca.fit_transform(X_train_scl)
     X_test_transformed = pca.transform(X_test_scl)

     # save
     filename = './' + self.save_dir + '/wine_pca_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)

     filename = './' + self.save_dir + '/wine_pca_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)

     filename = './' + self.save_dir + '/wine_pca_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)

     filename = './' + self.save_dir + '/wine_pca_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #20
0
 def gmm_lda_spam(self):
     """GMM analysis (up to 20 components) of the LDA-reduced spam data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_spam_data_lda_best()
     self.part1.gmm_analysis(tr_x, te_x, tr_y, te_y, 'Spam', 20, 'GMM LDA')
Example #21
0
 def kmeans_nba(self):
     """K-Means analysis (up to 20 clusters) of the raw NBA data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_nba_data()
     self.kmeans_analysis(tr_x, te_x, tr_y, te_y, 'NBA', 20)
Example #22
0
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 128)")
tf.flags.DEFINE_integer("num_epochs", 50, "Number of training epochs (default: 50)")
tf.flags.DEFINE_integer("evaluate_every", 50, "Evaluate model on dev set after this many steps (default: 50)")
tf.flags.DEFINE_boolean("enable_tensorboard", True, "Enable Tensorboard (default: True)")

FLAGS = tf.flags.FLAGS
# Print the parsed flag values. BUG FIX: the original called
# flag_values_dict() and discarded the result, then iterated the internal
# FLAGS.__flags dict, which maps names to Flag OBJECTS — printing object
# reprs instead of values. Use the public accessor's items directly.
print("Parameters:")
for attr, value in sorted(FLAGS.flag_values_dict().items()):
    print("{}={}".format(attr, value))
print("")

# Data Preparation
# Load data
print("Loading data...")
# NOTE(review): this rebinds the name `data_helper` from the class to an
# instance, shadowing the class for any later code in this module — kept
# as-is since code outside this snippet may rely on the name.
data_helper = data_helper(sequence_max_length=FLAGS.sequence_max_length)
train_data, train_label, test_data, test_label = data_helper.load_dataset(FLAGS.database_path)  # load the data
num_batches_per_epoch = int((len(train_data) - 1) / FLAGS.batch_size) + 1
print("Loading data succees...")

# ConvNet
acc_list = [0]
sess = tf.Session()
cnn = VDCNN(num_classes=train_label.shape[1],
            sequence_max_length=FLAGS.sequence_max_length,
            use_he_uniform=FLAGS.use_he_uniform)

# Optimizer and LR Decay
# Run batch-norm (and other) update ops before the training step.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    global_step = tf.Variable(0, name="global_step", trainable=False)
Example #23
0
 def kmeans_lda_wine(self):
     """K-Means analysis (up to 20 clusters) of the LDA-reduced wine data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_wine_data_lda_best()
     self.part1.kmeans_analysis(tr_x, te_x, tr_y, te_y, 'Wine', 20,
                                'K-Means LDA')
Example #24
0
 def gmm_lda_wine(self):
     """GMM analysis (up to 20 components) of the LDA-reduced wine data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_wine_data_lda_best()
     self.part1.gmm_analysis(tr_x, te_x, tr_y, te_y, 'Wine', 20, 'GMM LDA')
Example #25
0
 def gmm_wine(self):
     """GMM analysis (up to 30 components) of the raw wine data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_wine_data()
     self.gmm_analysis(tr_x, te_x, tr_y, te_y, 'Wine', 30)
Example #26
0
 def kmeans_lda_wine(self):
     """K-Means analysis (up to 20 clusters) of the LDA-reduced wine data."""
     splits = data_helper().get_wine_data_lda_best()
     tr_x, te_x, tr_y, te_y = splits
     self.part1.kmeans_analysis(tr_x, te_x, tr_y, te_y, 'Wine', 20, 'K-Means LDA')
Example #27
0
 def lda_spam(self):
     """LDA analysis of the raw spam data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_spam_data()
     self.lda_analysis(tr_x, te_x, tr_y, te_y, 'Spam')
Example #28
0
 def kmeans_lda_spam(self):
     """K-Means analysis (up to 20 clusters) of the LDA-reduced spam data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_spam_data_lda_best()
     self.part1.kmeans_analysis(tr_x, te_x, tr_y, te_y, 'Spam', 20,
                                'K-Means LDA')
Example #29
0
 def gmm_lda_nba(self):
     """GMM analysis (up to 20 components) of the LDA-reduced NBA data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_nba_data_lda_best()
     self.part1.gmm_analysis(tr_x, te_x, tr_y, te_y, 'NBA', 20, 'GMM LDA')
Example #30
0
 def gmm_lda_wine(self):
     """GMM analysis (up to 20 components) of the LDA-reduced wine data."""
     splits = data_helper().get_wine_data_lda_best()
     tr_x, te_x, tr_y, te_y = splits
     self.part1.gmm_analysis(tr_x, te_x, tr_y, te_y, 'Wine', 20, 'GMM LDA')
Example #31
0
 def kmeans_lda_nba(self):
     """K-Means analysis (up to 20 clusters) of the LDA-reduced NBA data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_nba_data_lda_best()
     self.part1.kmeans_analysis(tr_x, te_x, tr_y, te_y, 'NBA', 20, 'K-Means LDA')
Example #32
0
 def lda_pima(self):
     """LDA analysis of the raw Pima data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_pima_data()
     self.lda_analysis(tr_x, te_x, tr_y, te_y, 'Pima')
Example #33
0
 def lda_wine(self):
     """LDA analysis of the raw wine data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_wine_data()
     self.lda_analysis(tr_x, te_x, tr_y, te_y, 'Wine')
Example #34
0
 def nn_lda_wine(self):
     """Neural-network analysis of the LDA-reduced wine data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_wine_data_lda_best()
     self.nn_analysis(tr_x, te_x, tr_y, te_y, "Wine", "Neural Network LDA")
Example #35
0
 def kmeans_spam(self):
     """K-Means analysis (up to 20 clusters) of the raw spam data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_spam_data()
     self.kmeans_analysis(tr_x, te_x, tr_y, te_y, 'Spam', 20)
Example #36
0
 def kmeans_rp_nba(self):
     """K-Means analysis (up to 20 clusters) of the RP-reduced NBA data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_nba_data_rp_best()
     self.part1.kmeans_analysis(tr_x, te_x, tr_y, te_y, 'NBA', 20,
                                'K-Means RP')
Example #37
0
 def kmeans_pima(self):
     """K-Means analysis (up to 20 clusters) of the raw Pima data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_pima_data()
     self.kmeans_analysis(tr_x, te_x, tr_y, te_y, 'Pima', 20)
Example #38
0
 def gmm_rp_nba(self):
     """GMM analysis (up to 20 components) of the RP-reduced NBA data."""
     tr_x, te_x, tr_y, te_y = data_helper().get_nba_data_rp_best()
     self.part1.gmm_analysis(tr_x, te_x, tr_y, te_y, 'NBA', 20,
                             'GMM RP')
Example #39
0
if __name__ == '__main__':
    # if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test','predict']:
    #     raise ValueError("""usage: python run_abblstm.py [train / test / predict]""")

    print('Configuring VDCNN model...')
    # presumably 0 = word-level (not character-level) input — TODO confirm
    # against build_vocab's is_char parameter
    is_char = 0
    config = vdcnnConfig()
    # if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
    #     build_vocab(train_dir, vocab_dir, config.vocab_size,is_char)
    # build_vocab(train_dir, vocab_dir, config.vocab_size, is_char)

    # Data Preparation
    # Load data
    print("Loading data...")
    # NOTE(review): rebinds the name `data_helper` from the class to an
    # instance, shadowing the class for the rest of the script.
    data_helper = data_helper(sequence_max_length=config.seq_length,
                              train_file_path=train_dir)
    x_train, y_train, test_data, test_label, cat_to_id, id_to_cat, categories = data_helper.load_dataset(
        base_dir, 0)
    print("train size: ", len(x_train))
    # Carve a 10% validation split out of the test data.
    x_test, x_val, y_test, y_val = data_helper.split_dataset(
        test_data, test_label, 0.1)
    print("test size: ", len(x_test))
    print("Validation size: ", len(x_val))
    num_batches_per_epoch = int((len(x_train) - 1) / config.batch_size) + 1
    print("num_batches_per_epoch size: ", num_batches_per_epoch)
    x_predict = data_helper.load_dataset_predict(base_dir, 0)
    print("predict size: ", len(x_predict))
    print("Loading data succees...")

    # Record dataset-derived sizes back onto the config for the model.
    config.num_classes = len(categories)
    config.num_batches_per_epoch = num_batches_per_epoch
Example #40
0
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 128)")
tf.flags.DEFINE_integer("num_epochs", 100,
                        "Number of training epochs (default: 50)")
tf.flags.DEFINE_integer(
    "evaluate_every", 50,
    "Evaluate model on dev set after this many steps (default: 50)")
tf.flags.DEFINE_boolean("enable_tensorboard", True,
                        "Enable Tensorboard (default: True)")
tf.flags.DEFINE_integer("save_every", 2000,
                        "save model after this many steps (default: 2000)")

FLAGS = tf.flags.FLAGS
# NOTE(review): FLAGS.sequence_max_length, use_title, database_path, depth,
# downsampling_type, use_he_uniform and optional_shortcut are read below but
# not defined in this section — presumably defined elsewhere; verify.
# Data Preparation
# Load data
print("Loading data...")
# NOTE(review): rebinds the name `data_helper` from the class to an instance,
# shadowing the class for the rest of the script.
data_helper = data_helper(sequence_max_length=FLAGS.sequence_max_length,
                          use_title=FLAGS.use_title)
train_data, train_label, train_texts, test_data, test_label, test_texts = data_helper.load_dataset(
    FLAGS.database_path)
num_batches_per_epoch = int((len(train_data) - 1) / FLAGS.batch_size) + 1
print("Loading data succees...")

# ConvNet
acc_list = [0]
sess = tf.Session()
cnn = VDCNN(num_classes=train_label.shape[1],
            depth=FLAGS.depth,
            sequence_max_length=FLAGS.sequence_max_length,
            downsampling_type=FLAGS.downsampling_type,
            use_he_uniform=FLAGS.use_he_uniform,
            optional_shortcut=FLAGS.optional_shortcut)
Example #41
0
def main():
    '''
    Configure data_helper and part of the model's hyper-parameters. This
    config dict is also used (via tostring) to name the saved model file.

    Parameter meanings:
        score_style:
            values: 'mine'; 'adem'
            How the model computes the final score. 'mine' concatenates the
            three inputs into a single vector and produces the result through
            a two-layer fully connected network; 'adem' follows the ADEM
            paper, using matrix transforms to compute similarity among the
            three inputs as the final score.

        normal:
            values: True; False
            Whether to add a regularization term.

        LR:
            values: a number
            The learning rate.

        cate:
            values: 'mlut'; 'two'
            Regression over scores 0-4, or a 2-class classification problem.
            (NOTE(review): 'mlut' looks like a typo for 'mult' but is the key
            the rest of the project expects — do not rename.)

        weight:
            values: True; False
            Whether to weight the data: the classes are imbalanced, so sample
            weights inversely proportional to class frequency can be applied.

        data:
            values: '8'; '9'; 'origin'; 'all'
            There are several different batches of data; defaults to 'all'.

        seg:
            values: 'nio'; 'jieba'; 'ipx'
            Word-segmentation type: nio's own segmenter, jieba, or single
            characters.

        prewordembedding:
            values: True; False
            Whether to use pre-trained word embeddings.

        attflag:
            values: True; False
            Whether to use an attention mechanism.
    '''
    config = {'score_style': 'mine',
              'normal': True,
              'LR': 1,
              'cate': 'mlut',
              'weight': True,
              'data': 'all',
              'seg': 'jieba',
              'prewordembedding': False,
              'attflag': True}

    dh = data_helper(config=config)

    # Network hyper-parameters. Two separate config dicts are kept so the
    # first one (above) can be used on its own to name the model checkpoint.
    config_network = {
        'HIDDEN_SIZE': 128,
        'NUM_LAYERS': 1,
        'SRC_VOCAB_SIZE': dh.vocab_size,
        'KEEP_PROB': 0.8,
        'MAX_GRAD_NORM': 5,
        'word_embedding_file': 'word_dic_jieba_embedding.pk' if config[
                                                                    'seg'] == 'jieba' else 'word_dic_nioseg_embedding.pk',
        'max_len':dh.max_len
    }

    model_name = tostring(config)
    CHECKPOINT_PATH = '../MODEL/' + model_name + '_ckpt'

    # Check whether a checkpoint with the same name already exists.
    if os.path.exists(CHECKPOINT_PATH + '.index'):
        exists_flag = True
    else:
        exists_flag = False

    train_model=ADEM_model(config, config_network)
    saver=tf.train.Saver()
    step=0
    max_loop=9999999  # effectively "train until killed"
    min_=9999         # best (lowest) validation loss seen so far
    marks=[]          # history of [iteration, validation loss] pairs

    # Limit TensorFlow's GPU memory usage.
    config_tf = tf.ConfigProto()
    config_tf.gpu_options.per_process_gpu_memory_fraction = 0.9
    config_tf.gpu_options.allow_growth = True

    with tf.Session(config=config_tf) as sess:
        tf.global_variables_initializer().run()
        # If a saved model exists, load it and continue training from there.
        if exists_flag:
            print('loading trained model')
            saver = tf.train.Saver()
            saver.restore(sess, CHECKPOINT_PATH)

        for i in range(max_loop):
            context_input_, refrence_input_, model_input_, context_sequence_length_, \
            refrence_sequence_length_,model_sequence_length_,human_score_,grad_ys_=dh.next_batch(300)

            step=train_model.train_on_batch(sess,step,feed_dict={train_model.context_input:context_input_,
                                                                  train_model.context_sequence_length:context_sequence_length_,
                                                                  train_model.model_response_input:model_input_,
                                                                  train_model.refrence_response_input:refrence_input_,
                                                                  train_model.model_sequence_length:model_sequence_length_,
                                                                  train_model.refrence_sequence_length:refrence_sequence_length_,
                                                                  train_model.human_score:human_score_,
                                                                  train_model.grad_ys:grad_ys_
                                                                       })
            # Every 100 iterations (skipping iteration 0) evaluate on the
            # validation set and checkpoint whenever the loss improves.
            if i % 100 == 0 and i != 0:
                context_input_, refrence_input_, model_input_, context_sequence_length_, \
                refrence_sequence_length_, model_sequence_length_, human_score_,grad_ys_= dh.get_val_data()
                model_score=train_model.predict_on_batch(sess,feed_dict={train_model.context_input:context_input_,
                                                                  train_model.context_sequence_length:context_sequence_length_,
                                                                  train_model.model_response_input:model_input_,
                                                                  train_model.refrence_response_input:refrence_input_,
                                                                  train_model.model_sequence_length:model_sequence_length_,
                                                                  train_model.refrence_sequence_length:refrence_sequence_length_,
                                                                       })
                loss = train_model.mean_square_error(human_score_, model_score)
                if loss < min_:
                    min_ = loss
                    saver.save(sess, CHECKPOINT_PATH)
                marks.append([i, loss])
                # Print the full loss history plus the config for the log.
                for k in marks:
                    print(k)
                print(config)
Example #42
0
 def kmeans_wine(self):
     """Run the K-Means clustering analysis on the wine dataset."""
     helper = data_helper()
     X_tr, X_te, y_tr, y_te = helper.get_wine_data()
     self.kmeans_analysis(X_tr, X_te, y_tr, y_te, 'Wine', 20)
Example #43
0
 def lda_nba(self):
     """Apply the LDA dimensionality-reduction analysis to the NBA data."""
     splits = data_helper().get_nba_data()
     X_tr, X_te, y_tr, y_te = splits
     self.lda_analysis(X_tr, X_te, y_tr, y_te, 'NBA')
Example #44
0
 def kmeans_lda_pima(self):
     """Run K-Means clustering on the LDA-reduced Pima dataset."""
     helper = data_helper()
     X_tr, X_te, y_tr, y_te = helper.get_pima_data_lda_best()
     self.part1.kmeans_analysis(X_tr, X_te, y_tr, y_te, 'Pima', 20,
                                'K-Means LDA')
Example #45
0
 def gmm_pima(self):
     """Run the Gaussian-mixture clustering analysis on the Pima dataset."""
     splits = data_helper().get_pima_data()
     X_tr, X_te, y_tr, y_te = splits
     self.gmm_analysis(X_tr, X_te, y_tr, y_te, 'Pima', 30)
Example #46
0
 def gmm_lda_pima(self):
     """Run Gaussian-mixture clustering on the LDA-reduced Pima dataset."""
     helper = data_helper()
     X_tr, X_te, y_tr, y_te = helper.get_pima_data_lda_best()
     self.part1.gmm_analysis(X_tr, X_te, y_tr, y_te, 'Pima', 20,
                             'GMM LDA')
Example #47
0
 def gmm_nba(self):
     """Run the Gaussian-mixture clustering analysis on the NBA dataset."""
     helper = data_helper()
     X_tr, X_te, y_tr, y_te = helper.get_nba_data()
     self.gmm_analysis(X_tr, X_te, y_tr, y_te, 'NBA', 30)