Example #1
    ]
    #~ l = [
    #~ [1,2,3,4,5,],
    #~ [1,2,3,4,5,],
    #~ [6,7,8,9,10,],
    #~ [6,7,8,9,10,],
    #~ [1,2,3,4,5,],
    #~ ]
    for d in l:
        yield d


if __name__ == "__main__":
    # Format: (name of analysis, number of topics, alpha, beta, burn, length, dataset feature vector iterator)
    given = [
        #~ ("test", 2, 0.1, 0.1, 100, 10, test_data),
        ('state_of_the_union', 5, 0.1, 0.1, 499, 1, state_of_the_union),
    ]

    for settings in given:
        analysis = LDA(settings[1], settings[2], settings[3], settings[4],
                       settings[5])
        print(settings[0])
        analysis.run_analysis(settings[6]())
        analysis.print_topics(10)
        with io.open('results_%s.json' % (settings[0]),
                     'w',
                     encoding='utf-8',
                     errors='ignore') as f:
            f.write(unicode(json.dumps(analysis.log_likelihoods)))
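A hedged readability sketch of the same loop with the settings tuple unpacked into named variables (field names follow the format comment above; the json dump at the end is unchanged and omitted here):

    for name, n_topics, alpha, beta, burn, length, data_iter in given:
        analysis = LDA(n_topics, alpha, beta, burn, length)
        print(name)
        analysis.run_analysis(data_iter())
        analysis.print_topics(10)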
Example #2
    vocab_list = [s[:-1] for s in f.readlines()]
vectorizer = CountVectorizer(vocabulary=vocab_list)

with open(sys.path[0] + '\\' + sys.argv[1], 'r') as f:
    corpus = [line[:-1] for line in f.readlines()]

X = vectorizer.fit_transform(corpus)
print len(vectorizer.vocabulary_)
print X.shape
vocab_list = sorted(vectorizer.vocabulary_,
                    key=lambda word: vectorizer.vocabulary_[word])
D, V = X.shape

n_topics = int(sys.argv[2])
n_iter = int(sys.argv[3]) if len(sys.argv) > 3 else 1000
lda = LDA(n_topics, D, V, 1. / n_topics, 1. / n_topics, 1, 0.51)
start_time = time.time()
lda.fit_batched(X, n_iter=n_iter)
end_time = time.time()
print
print 'Total time to fit LDA model: %.3f seconds' % (end_time - start_time)
sys.stdout.flush()
mean_dist = (lda.lmbda.T / lda.lmbda.sum(axis=1)).T
mean_dist_normalized = mean_dist - mean_dist.mean(axis=0)
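# (Added note) the two lines above normalize each row of lda.lmbda to sum to 1,
# giving one word distribution per topic, then subtract the per-word mean across
# topics so the top-20 lists printed below favor topic-distinctive words over
# globally frequent ones; lmbda is assumed here to be the fitted topic-word matrix.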
for row in mean_dist_normalized:
    print[
        vocab_list[ind]
        for ind in sorted(range(len(row)), key=lambda ind: -row[ind])[0:20]
    ]
    sys.stdout.flush()
#print 'lambda (from my LDA):', lda.lmbda
Example #3
plt.scatter(x1,
            x2,
            c=y,
            edgecolor='none',
            alpha=0.8,
            cmap=plt.cm.get_cmap('viridis', 3))

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar()
plt.show()

# Plot the data projected onto the two linear discriminants
from lda import LDA
lda = LDA(2)
lda.fit(X_min_max, y)
X_projected = lda.transform(X_min_max)

print('Min-max normalized X:', X_min_max.shape)  # (150, 4)
print('X after LDA:', X_projected.shape)  # (150, 2)

x1 = X_projected[:, 0]
x2 = X_projected[:, 1]

plt.scatter(x1,
            x2,
            c=y,
            edgecolor='none',
            alpha=0.8,
            cmap=plt.cm.get_cmap('viridis', 3))
Example #4
from data import Data
from lda import LDA

data = Data()

data.load()
data.textPre('r')
tf = data.saveModel('r')

model = LDA()
model.fit(tf)
#model.print_top_words(data.tf_vectorizer.get_feature_names())

Example #5
import time
import cPickle as pickle
import scipy.sparse

with open(sys.path[0] + '\\dict.txt', 'r') as f:
    vocab_list = [s[:-1] for s in f.readlines()]
vectorizer = CountVectorizer(vocabulary=vocab_list)

V = len(vectorizer.vocabulary)
n_topics = int(sys.argv[1])
batch_size = int(sys.argv[2])
n_iter = int(sys.argv[3])
kappa = float(sys.argv[4]) if len(sys.argv) > 4 else 0.51
D = batch_size * n_iter
max_retrieve = 64  # largest number of articles that are queried together in 1 function call
lda = LDA(n_topics, D, V, 1. / n_topics, 1. / n_topics, 1, kappa)
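# (Added note) the positional arguments appear to be (n_topics, D, V, alpha, eta,
# tau0, kappa), the same pattern as in Example #2; the default kappa = 0.51 keeps
# the online-VB learning-rate decay inside the (0.5, 1] range needed for
# convergence. This reading is inferred from the variable names.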

elbo_lst = []
scrape_time = 0.
examples = []
log_likelihoods = []
start_time_loop = time.time()
for t in range(n_iter):
    print '====================BATCH %d====================' % t
    sys.stdout.flush()
    articlenames = []
    n_requested = 0
    mats = []
    while n_requested < batch_size:
        request_size = min(batch_size - n_requested, max_retrieve)
        start_time = time.time()
Example #6
import pandas as pd
from lda import LDA

df = pd.read_pickle("df.pkl")

punctuation = set("""!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~""")
instances = [[lemma for lemmatized_sentence in lemmatized_speech
              for lemma in lemmatized_sentence
              if lemma not in punctuation]
             for lemmatized_speech in df.lemmas]

K = 50
beta = 0.01
epochs = 10000

lda = LDA(num_topics=K, corpus=instances, alpha=50/K, beta=beta, epochs=epochs, no_below=9, no_above=0.7)
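# (Added note) alpha = 50/K is the widely cited Griffiths & Steyvers heuristic for a
# symmetric document-topic prior, and beta = 0.01 is a typical small topic-word prior.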

pd.to_pickle(lda, "lda.pkl")
Example #7
    def test_clean_text(self):
        bcr = LDA()
        text = 'Monkey jupyter alexander great text fish cat good great in it which is great alpha great'
        text = bcr.clean_text(text)
        print(text)
        self.assertNotIn('great', text)
Example #8
    def setUp(self):
        self.description_csv = pd.read_csv("docs/description.csv")
        self.description_1000_csv = pd.read_csv("docs/description_1000.csv")
        self.dp = DocsPreprocessor()
        self.description_1000 = self.dp.process(self.description_1000_csv)
        self.lda = LDA(self.description_1000)
Example #9
# bow = bow / bow.sum(axis=1)[:, None]

# Number of docs
n_docs = bow.shape[0]
# Number of unique words in the vocabulary
n_vocab = bow.shape[1]
# Number of dimensions in a single word vector
n_units = 256
# number of topics
n_topics = 20
batchsize = 128
counts = corpus.keys_counts[:n_vocab]
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]

model = LDA(n_docs, n_topics, n_units, n_vocab)
if os.path.exists('lda.hdf5'):
    print("Reloading from saved")
    serializers.load_hdf5("lda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / bow.shape[0]
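# (Added note) fraction is presumably the share of the corpus covered by one
# minibatch; in lda2vec-style loops it is typically used to rescale the per-batch
# loss to the full corpus, though its use is not shown in this excerpt.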
for epoch in range(50000000):
    if epoch % 100 == 0:
        p = cuda.to_cpu(model.proportions.W.data).copy()
        f = cuda.to_cpu(model.factors.W.data).copy()
        w = cuda.to_cpu(model.embedding.W.data).copy()
        d = topics.prepare_topics(p, f, w, words)
Example #10
def extract_aspects_from_reviews(K):
    raw_review_filename = 'raw_reviews.txt.ldapre'
    raw_texts = load_reviews(raw_review_filename)
    lda_model = LDA(K=K, doc_set=raw_texts)
    lda_model.train()
    lda_model.save(yelp_dir + 'review_t%s.lda' % K)
Example #11
from lda import LDA
from dataset import TwentyNewsDataset
import time
dataset = TwentyNewsDataset()
dataset.load_data()
n_topics = 20

lda = LDA(n_topics)
lda.initialize(dataset.data_matrix)
lda.load_label('labels.txt', dataset.dictionary)
print(lda.print_labels())

for _ in range(100):
    lda.fit()

lda.get_topic_word()
lda.get_doc_topic()
lda.print_top_words(dataset.dictionary, 10)
Example #12
def run_lda(data_dir, num_topics, use_mini_batches, batch_size, epochs,
            model_file, create_dict, dict_file, load_dbs):
    """ Run training and display test results if visualize is true

  Args:
    data_dir(str): directory containing director(y/ies) of data
    num_topics(int): Number of topics to train the model on
    batch_size(int): Size of mini batches used to train the model
    epochs(int): Number of epochs to train the data for on the train set
    model_file(str): saved model file to continue training on
    create_dict(bool): create dictionary from data or load dict from a file
    dict_file(str): dict_file path to load dictionary from 
    load_dbs(bool): if true, load databases from saved pickle files
  """

    assert (os.path.isdir(data_dir)), "Invalid data directory path"

    use_model_file = False
    if model_file:
        use_model_file = True

    #Create model
    lda = LDA(num_topics=num_topics)
    if create_dict:
        print 'Creating dictionary from data'
        #Create word to id mapping for all texts
        lda.create_dict(data_dir)
        lda.store_dict_to_disk('./dict/dictionary')
    else:
        print 'Loading existing dictionary...'
        lda.load_dict_from_disk(dict_file)

    #Iterate over all data and train model
    for root, dirs, files in os.walk(data_dir):
        if load_dbs:
            print 'Training will be done on existing databases'
            datum = files
        else:
            print 'Training will be done after creating databases from text files'
            datum = dirs
        #Iterate over sub-dirs
        for d in datum:
            db = None
            if not load_dbs:
                #Create database object
                db = Database(d, os.path.abspath(data_dir + '/' + d))
            else:
                db = Database()
                #Load database object from saved file
                db.load_from_disk(data_dir + '/' + d)

            #Add database to model
            lda.add_database(db)

            if use_model_file:
                #Load model parameters from model file and call train
                lda.train(model_file,
                          db_name=db.get_name(),
                          use_mini_batches=use_mini_batches,
                          use_internal_dict=True,
                          batch_size=batch_size,
                          num_epochs=epochs)
                #Set to false, as we just need to load the model once and train it on the entire dataset
                use_model_file = False
            else:
                #Call train on the model
                lda.train(db_name=db.get_name(),
                          use_mini_batches=use_mini_batches,
                          use_internal_dict=True,
                          batch_size=batch_size,
                          num_epochs=epochs)
            if not load_dbs:
                #Remove db to free memory (can also save it if preferred)
                db.store_to_disk('./databases/' + d)

            lda.remove_database(db.get_name())
            del db
            gc.collect()
            tmp_file = './models/' + d + str(num_topics)
            lda.save_model(tmp_file)

    #Save final model
    file_name = './models/final' + str(num_topics)
    lda.save_model(file_name)
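A hedged usage sketch for run_lda; every path and value below is an illustrative assumption rather than something taken from the surrounding project:

# Hypothetical invocation; adjust directories, topic count and flags to your layout.
run_lda(data_dir='./data', num_topics=20, use_mini_batches=True,
        batch_size=256, epochs=5, model_file=None,
        create_dict=True, dict_file=None, load_dbs=False)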
Example #13
    def clustering_measure(self, n_cluster):
        km = KMeans(n_cluster)
        km.fit(self.doc_features)
        print("Adjusted Rand-Index: %.3f"
              % metrics.adjusted_rand_score(self.doc_class, km.labels_))

    def cross_validation(self):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            self.doc_features, self.doc_class, test_size=0.4, random_state=0)
        clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
        print ("Cross-Validation Score: %.3f" % clf.score(X_test, y_test))


if __name__ == '__main__':
    # load dataset
    dataset = CNN()
    dataset.load_data('/home/yi/Dropbox/workspace/data/cnn/')

    # train lda
    lda = LDA(5)
    lda.initialize(dataset.data_matrix)
    #lda.load_label('labels.txt', dataset.dictionary)
    for iter in range(20):
        lda.fit(dataset.data_matrix)
    lda.fininsh()
    lda.print_top_words(dataset.dictionary, 10)

    # evaluate lda
    eval = Evaluator(dataset, lda)
    eval.clustering_measure(n_cluster=5)
    eval.cross_validation()
Example #14
    df = pd.read_csv(opt.dataset)
    articles = df['content']

    # Generate the document-term matrix and the vectorizer
    processed_articles = articles.apply(tokenizer)
    cv, dtm = document_term_matrix(processed_articles, opt.vectorizer, opt.min_df, opt.max_df)
    # Generate the bag-of-words, the dictionary, and the word2vec model trained on the dataset
    bow, dictionary, w2v = get_dictionary(cv, articles, opt.min_df, opt.size, opt.sg)
    # Define the corpus
    corpus = DTMcorpus([i for i in bow])

    # Find the optimum number of topics for LDA in a range from 2 to 50 topics
    coherence_scores = []

    for num_topics in range(2, 51):
        topic_list, _ = LDA(dtm, cv, num_topics, opt.top_words)

        coherence = CoherenceScores(topic_list)
        coherence_scores.append(coherence.c_v())

    # range() starts at 2 topics, so shift the argmax index to get the topic count
    optimum_num_topics = np.argmax(coherence_scores) + 2

    # Define the time slices (10-year periods)
    t1 = df[df['yearlo'] < 1970].sort_values(by='yearlo')
    t2 = df[(1970 <= df['yearlo']) & (df['yearlo'] < 1980)].sort_values(by='yearlo')
    t3 = df[(1980 <= df['yearlo']) & (df['yearlo'] < 1990)].sort_values(by='yearlo')
    t4 = df[(1990 <= df['yearlo']) & (df['yearlo'] < 2000)].sort_values(by='yearlo')
    t5 = df[(2000 <= df['yearlo']) & (df['yearlo'] < 2010)].sort_values(by='yearlo')
    t6 = df[2010 <= df['yearlo']].sort_values(by='yearlo')

    time_slices = [len(t1), len(t2), len(t3), len(t4), len(t5), len(t6)]
Example #15
    def fit_model(self, data, params):
        from lda import LDA
        lda_instance = LDA(**params)
        lda_instance.fit(data)

        return lda_instance
Example #16
import os
import sys
import numpy as np
import matplotlib.pyplot as plt

from lda import LDA

if __name__ == "__main__":
    lda = LDA(sys.argv[1], lda_weight=float(sys.argv[2]))

    with open(os.path.join(sys.argv[1], "file-list1"), 'r') as f:
        target_files = [i.strip() for i in f.readlines()]
    result = []
    for target_file in target_files:
        result.append(lda.retrieve_single(target_file, [5]))
    print(np.mean(np.where(np.array(result) == 1, 1, 0)))
    print(np.mean(np.where(np.array(result) <= 5, 1, 0)))
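    # (Added note) result appears to hold the rank of the correct document for each
    # query, so the two prints above report top-1 and top-5 retrieval accuracy.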
    plt.hist(result)
    plt.savefig("result.jpg")

    lda.retrieve_multiple(["D00076", "D01032", "D01350", "D02582", "D05005"],
                          [10, 5, 1])
Example #17
def main():

    # Load dataset
    dataset = load_data()


    ##############
    # BASIC TEST #
    ##############

    # Setup
    lda = LDA()
    lda.dataset = copy.deepcopy(dataset)
    lda.run_setup()

    # Set hyper parameters
    lda.M_pca = 150
    lda.M_lda = 40

    # Run
    lda.run_pca_lda()
    err, y_pred = lda.run_nn_classifier()

    # find wrong classification
    err_index = 0
    for i in range(1,len(y_pred)):
        if not y_pred[i] == dataset['test_y'][i]:
            err_index = i
            break
        if y_pred[i] == dataset['test_y'][i]:
            corr_index = i
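    # (Added note) the loop above stops at the first misclassified test sample;
    # corr_index holds the last correctly classified index seen before that point.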

    correct_face = copy.deepcopy(dataset['test_x'][:,[err_index]])
    index = nn_classifier_index(lda.transform(correct_face),lda.transform(dataset['train_x']))
    wrong_face   = copy.deepcopy(dataset['train_x'][:,[index]])

    correct_face_2 = copy.deepcopy(dataset['test_x'][:,[corr_index]])
    index = nn_classifier_index(lda.transform(correct_face_2),lda.transform(dataset['train_x']))
    corr_face   = copy.deepcopy(dataset['train_x'][:,[index]])

    # plot both faces to compare
    plt.figure()
    f, ax = plt.subplots(2, 2, sharey=True)
    f.suptitle('PCA-LDA-NN wrong classification comparison')

    img = (correct_face).reshape((46,56))
    img = np.rot90(img,3)
    ax[0,0].imshow(img, cmap="gray")
    ax[0,0].axis('off')
    ax[0,0].set_title('Input Face')

    img = (wrong_face).reshape((46,56))
    img = np.rot90(img,3)
    ax[0,1].imshow(img, cmap="gray")
    ax[0,1].axis('off')
    ax[0,1].set_title('Wrong Prediction')

    img = (correct_face_2).reshape((46,56))
    img = np.rot90(img,3)
    ax[1,0].imshow(img, cmap="gray")
    ax[1,0].axis('off')
    ax[1,0].set_title('Input Face')

    img = (corr_face).reshape((46,56))
    img = np.rot90(img,3)
    ax[1,1].imshow(img, cmap="gray")
    ax[1,1].axis('off')
    ax[1,1].set_title('Correct Prediction')

    #plt.title('Comparison of reconstruction')
    plt.savefig("results/q3/wrong_pca_lda_nn_classifier.png", format="png", transparent=True)

    '''


    ######################
    # PCA-LDA EVALUATION #
    ######################

    # Evaluate for different M_pca
    M_pca = np.arange(75,300,10)
    M_lda = np.arange(20,100,10)

    err_results = [ [] for m in M_lda ]
    lda_index = 0

    for m_lda in M_lda:
        for m_pca in M_pca:
            if m_lda > m_pca:
                continue

            # Setup
            lda = LDA()
            lda.dataset = copy.deepcopy(dataset)
            lda.run_setup()

            # Set hyper parameters
            lda.M_pca = m_pca
            lda.M_lda = m_lda

            # Run
            lda.run_pca_lda()
            err,_ = lda.run_nn_classifier()

            print("M PCA: {}, M LDA: {}, ERROR: {}".format(m_pca,m_lda,err))

            err_results[lda_index].append(err)

        lda_index += 1

    fig = plt.figure()
    legends = [ '' for i in range(len(err_results)) ]
    for i in range(len(err_results)):
        legends[i], = plt.plot(M_pca,err_results[i],label='M lda = {}'.format(M_lda[i]))
    plt.legend(handles=legends)
    plt.show()

    '''

    '''
    ###################
    # PCA-LDA BAGGING #
    ###################

    # Number of machines
    NUM_MACHINES = 5

    # Machine Parameters
    M_pca = 100
    M_lda = 50
    sample_size = 300

    machine = [LDA() for i in range(NUM_MACHINES)]
    class_sizes = []

    for i in range(NUM_MACHINES):
        # Randomly sample training data TODO try stratified and un-stratified
        sample_index = sample_rnd(dataset['train_y'],sample_size)
        #sample_index = sample_stratified(dataset['train_y'],sample_size)

        # assign dataset for machine
        machine[i].dataset['train_x'] = copy.deepcopy(dataset['train_x'][:,sample_index])
        machine[i].dataset['train_y'] = copy.deepcopy(dataset['train_y'][sample_index])

        machine[i].dataset['test_x'] = copy.deepcopy(dataset['test_x'])
        machine[i].dataset['test_y'] = copy.deepcopy(dataset['test_y'])

        # Setup each machine
        machine[i].run_setup()
        machine[i].M_pca = M_pca
        machine[i].M_lda = M_lda

        class_sizes.append(machine[i].get_class_sizes())

    # variable to store label results
    labels =  [[] for i in range(NUM_MACHINES)]

    for i in range(NUM_MACHINES):
        machine[i].run_pca_lda()
        _, labels[i] = machine[i].run_nn_classifier()

    # get committee machine output
    labels_out = committe_machine_majority_vote(labels)
    err = identity_error(labels_out,dataset['test_y'])

    print('error(majority voting): ',err)

    # get committee machine output
    labels_out = committe_machine_weighted_voting(labels,class_sizes)
    err = identity_error(labels_out,dataset['test_y'])

    print('error(weighted voting): ',err)

    # get committee machine output (average)
    labels_out = committe_machine_average(labels)
    err = identity_error(labels_out,dataset['test_y'])

    print('error(average): ',err)

    '''


    ###################################
    # PCA-LDA PARAMETER RANDOMISATION #
    ###################################

    # Number of machines
    NUM_MACHINES = 15

    # Machine Parameters
    M0 = 125
    M1 = 25

    #M_pca = 100
    M_lda = 40
    #sample_size = 5

    machine = [LDA() for i in range(NUM_MACHINES)]

    for i in range(NUM_MACHINES):
        # Choose random eigenvectors for PCA
        M_pca = random_parameters(M0,M1,max_size=(len(dataset['train_y'])-1))

        # assign dataset for machine
        machine[i].dataset['train_x'] = copy.deepcopy(dataset['train_x'])
        machine[i].dataset['train_y'] = copy.deepcopy(dataset['train_y'])

        machine[i].dataset['test_x'] = copy.deepcopy(dataset['test_x'])
        machine[i].dataset['test_y'] = copy.deepcopy(dataset['test_y'])

        # Setup each machine
        machine[i].run_setup()
        machine[i].M_pca = M_pca
        machine[i].M_lda = M_lda

    # variable to store label results
    labels =  [[] for i in range(NUM_MACHINES)]

    for i in range(NUM_MACHINES):
        machine[i].run_pca_lda(m_pca_type=1)
        _, labels[i] = machine[i].run_nn_classifier()

    # get committee machine output
    labels_out = committe_machine_majority_vote(labels)
    err = identity_error(labels_out,dataset['test_y'])

    print('error(majority voting): ',err)

    # get committee machine output (average)
    labels_out = committe_machine_average(labels)
    err = identity_error(labels_out,dataset['test_y'])

    print('error(average): ',err)
    plot_confusion_matrix(dataset["test_y"], labels_out, "results/q3/lda_pca_ensemble_classifier_cm",normalize=True)




    ############################
    # ENSEMBLE HYPERPARAMETERS #
    ############################

    # Number of machines
    NUM_MACHINES = 50

    # List of errors
    err = [ [0,0] for i in range(NUM_MACHINES) ]
    err = [
        [0 for i in range(NUM_MACHINES) ],
        [0 for i in range(NUM_MACHINES) ]
    ]

    # HIGH CORRELATION #

    # Machine Parameters
    M0 = 125
    M1 = 25

    #M_pca = 100
    M_lda = 40
    #sample_size = 5

    machine = [LDA() for i in range(NUM_MACHINES)]

    for i in range(NUM_MACHINES):
        # Choose random eigenvectors for PCA
        M_pca = random_parameters(M0,M1,max_size=(len(dataset['train_y'])-1))

        # assign dataset for machine
        machine[i].dataset['train_x'] = copy.deepcopy(dataset['train_x'])
        machine[i].dataset['train_y'] = copy.deepcopy(dataset['train_y'])

        machine[i].dataset['test_x'] = copy.deepcopy(dataset['test_x'])
        machine[i].dataset['test_y'] = copy.deepcopy(dataset['test_y'])

        # Setup each machine
        machine[i].run_setup()
        machine[i].M_pca = M_pca
        machine[i].M_lda = M_lda

    # variable to store label results
    labels =  [[] for i in range(NUM_MACHINES)]

    for i in range(NUM_MACHINES):
        machine[i].run_pca_lda(m_pca_type=1)
        _, labels[i] = machine[i].run_nn_classifier()

    # get committee machine output
    for i in range(NUM_MACHINES):
        labels_out = committe_machine_majority_vote(labels[:(i+1)])
        err[0][i]  = identity_error(labels_out,dataset['test_y'])

    # LOW CORRELATION #

    # Machine Parameters
    M0 = 25
    M1 = 125

    #M_pca = 100
    M_lda = 40
    #sample_size = 5

    machine = [LDA() for i in range(NUM_MACHINES)]

    for i in range(NUM_MACHINES):
        # Choose random eigenvectors for PCA
        M_pca = random_parameters(M0,M1,max_size=(len(dataset['train_y'])-1))

        # assign dataset for machine
        machine[i].dataset['train_x'] = copy.deepcopy(dataset['train_x'])
        machine[i].dataset['train_y'] = copy.deepcopy(dataset['train_y'])

        machine[i].dataset['test_x'] = copy.deepcopy(dataset['test_x'])
        machine[i].dataset['test_y'] = copy.deepcopy(dataset['test_y'])

        # Setup each machine
        machine[i].run_setup()
        machine[i].M_pca = M_pca
        machine[i].M_lda = M_lda

    # variable to store label results
    labels =  [[] for i in range(NUM_MACHINES)]

    for i in range(NUM_MACHINES):
        machine[i].run_pca_lda(m_pca_type=1)
        _, labels[i] = machine[i].run_nn_classifier()

    # get committee machine output
    for i in range(NUM_MACHINES):
        labels_out = committe_machine_majority_vote(labels[:(i+1)])
        err[1][i]  = identity_error(labels_out,dataset['test_y'])

    plt.figure()
    plt.title('Comparison of Different Committee Machines')
    plt.xlabel('Number of Machines')
    plt.ylabel('Error (%)')
    plt.plot(range(NUM_MACHINES),err[0], label="High Machine Correlation")
    plt.plot(range(NUM_MACHINES),err[1], label="Low Machine Correlation")
    plt.legend()
    plt.savefig('results/q3/num_machines_eval.png',
                format='png', transparent=True)
Example #18
    return vote_time.group()


def parse_topic_from_html(tree):
    """ Parse and clear vote topic from html """

    vote_topic = tree.xpath('//span[@class="rvts1"]/text()')[1]
    vote_topic = sub(r'[^{0}]'.format(UKR_ALPHABET), '', vote_topic.lower())
    return sub(r' {2,}', ' ', vote_topic)


def load_factions(filename='json/factions.json'):
    """ Load factions json as dictionary """

    with open(filename, encoding=ENCODING) as file:
        factions = loads(file.read())

    for key in factions:
        if factions[key][0][0] == '\ufeff':
            factions[key][0] = factions[key][0][1:]

    return factions


if __name__ == '__main__':
    data_path = Path(__file__).absolute().ancestor(2).child('Data').child(
        'html')

    # parse_vote_topics(data_path, first_of=100)
    parse_html(data_path, LDA(), first_of=10)
Example #19
# In[ ]:

#scikit-learn LDA implementation
#201
#1121
#4617
#model=LatentDirichletAllocation(n_topics=num_topics,max_iter=100,learning_method='batch',random_state=201)#,doc_topic_prior=50.0/num_topics,topic_word_prior=200.0/num_topics)
#model.fit(bag_of_words)

# In[ ]:

# In[ ]:

#lda implementation from https://github.com/ariddell/lda using collapsed gibbs sampling
model = LDA(n_topics=num_topics, n_iter=1000, random_state=201, refresh=100)
model.fit(bag_of_words)  # model.fit_transform(X) is also available
#topic_word = model.topic_word_  # model.components_ also works

# In[ ]:


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join(
            [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))


# In[ ]:
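# (Added usage note) assuming the CountVectorizer that produced bag_of_words is
# available as `vectorizer` (it is not shown in this excerpt), the helper above
# would typically be called as:
#   print_top_words(model, vectorizer.get_feature_names(), 10)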
Example #20
    X_train = eval("X_%s_train" % l)
    y_train = eval("y_%s_train" % l)
    X_test = eval("X_%s_test" % l)
    y_test = eval("y_%s_test" % l)

    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 18))
    ax = ax.flatten()

    print(l)
    """
    Run LDA
    """
    LDA_clf = LDA()
    LDA_clf.fit(X_train, y_train)

    lda_train_error = np.mean(LDA_clf.predict(X_train).flatten() != y_train)
    lda_test_error = np.mean(LDA_clf.predict(X_test).flatten() != y_test)

    plot_decision_regions(X=X_combined,
                          y=y_combined,
                          classifier=LDA_clf,
                          test_idx=range(X_train.shape[0],
                                         X_train.shape[0] + X_test.shape[0]),
                          ax=ax[0])
    ax[0].set_xlabel("x1", fontsize="large")
    ax[0].set_ylabel("x2", fontsize="large")
    ax[0].legend(loc="upper right", fontsize="large")
    ax[0].set_title("Generative model (LDA) on dataset %s" % l,
Example #21
print('generating training/testing corpus...')
corpus = Corpus()
corpus.generate_corpus_from_graph_using_SIP(graph, '012-SIP')
train_corpus, test_corpus = corpus_split(corpus)

# stochastic variational inference
hyper_params_svb = {}
hyper_params_svb['num_topics'] = K
hyper_params_svb['alpha'] = alpha  # uniform [1/K, ..., 1/K]
hyper_params_svb['eta'] = eta  # uniform [1/K, ..., 1/K]
hyper_params_svb['size_vocab'] = graph.n
hyper_params_svb['num_docs'] = train_corpus.num_docs
hyper_params_svb['tau0'] = tau0
hyper_params_svb['kappa'] = kappa
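# (Added note) tau0 and kappa are, in standard stochastic variational inference,
# the step-size schedule parameters rho_t = (tau0 + t) ** (-kappa), presumably the
# "rhot" column written to the log below; this reading is inferred from the names.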

lda_svb = LDA(hyper_params_svb, 'SVB')
log_file = open(f_log, "w")
log_file.write("iteration time rthot held-out log-perplexity estimate\n")

total_time = 0
D = train_corpus.num_docs
max_iter_per_epoch = np.ceil(D / batchsize)

print('stochastic variational inference...')
for epoch in range(epochs):
    iter = 0
    printProgress(iter,
                  max_iter_per_epoch,
                  prefix='epoch %s' % int(epoch + 1),
                  suffix='complete',
                  barLength=50)
Example #22
    def run(self, mode, cntStatus=True, saveVid=False, showVid=True):
        lbp = lbp_feature()
        # neural_network = neural_net(75, 3)
        # neural_network.create_struct(150)
        # neural_network.load_model(settings.STATICFILES_DIRS[0])
        lda = LDA(75, 3)
        #lda.create_struct(150)
        if mode == 'predict':
            lda.load_model(settings.STATICFILES_DIRS[0])
        self.video.set(cv2.cv.CV_CAP_PROP_POS_MSEC, 0)
        kernel = np.ones((10, 10), np.uint8)
        lanes = [[] for x in range(self.totalLane)]
        totalCars = [0] * self.totalLane
        num_car_detect = 0
        self.timer = threading.Timer(5.0, self.progress)
        self.timer.start()
        while self.video.isOpened():
            ret, frame = self.video.read()
            if not ret:
                break
            frameOrigin = deepcopy(frame)
            res = frame
            self.num_frame +=1
            for point in self.lanePoints:
                cv2.polylines(frame, [point], True, (0, 255, 0), 3)

            filteredFrame = cv2.GaussianBlur(frame, (5, 5), 0)
            if self.fgMask is None:
                self.fgMask = self.subtractor.apply(filteredFrame, -1)
                test = deepcopy(self.fgMask)
            self.fgMask = self.subtractor.apply(filteredFrame, self.fgMask, -1)
            self.fgMask = cv2.dilate(self.fgMask, kernel, iterations=1)
            self.fgMask = cv2.erode(self.fgMask, kernel, iterations=1)

            self.fgMask = cv2.morphologyEx(self.fgMask, cv2.MORPH_CLOSE, np.ones((30, 30), np.uint8))
            self.fgMask = cv2.morphologyEx(self.fgMask, cv2.MORPH_CLOSE, np.ones((30, 30), np.uint8))
            self.fgMask = cv2.morphologyEx(self.fgMask, cv2.MORPH_OPEN, np.ones((5, 5), np.uint8))
            tempMask = deepcopy(self.fgMask)
            carImg = cv2.bitwise_and(frameOrigin, frameOrigin, mask=self.fgMask)
# Section tracking and Detection
            contours, hrc = cv2.findContours(tempMask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_KCOS)
            isIn = [False] * self.totalLane
            laneObj = [[] for x in range(self.totalLane)]

            outLane = [[] for x in range(self.totalLane)]
            for obj in contours:
                moment = cv2.moments(obj)
                if moment['m00'] == 0:
                    continue
                cx = int(moment['m10']/moment['m00'])
                cy = int(moment['m01']/moment['m00'])
                pX, pY, w, h = cv2.boundingRect(obj)

                isNotLane = True
                for numLane in range(len(self.laneContours)):
                    if cv2.pointPolygonTest(self.laneContours[numLane][0], (cx, cy), False) == 1:
                        car_object = {"centroid": (cx, cy+h/2), "origin": (pX, pY), "height": h, "width": w}
                        laneObj[numLane].append(car_object)
                        isNotLane = False
                        break
                if isNotLane:
                    for numLane in range(len(self.laneContours)):
                        lanePoint =  self.lanePoints[numLane]

                        if cx >= lanePoint[3][0][0] and cx <= lanePoint[2][0][0]\
                                and cy >= lanePoint[3][0][1]  and cy <= lanePoint[3][0][1]+50:
                            car_object = {"centroid": (cx, cy+h/2), "origin": (pX, pY), "height": h, "width": w}
                            outLane[numLane].append(car_object)

            for numLane in range(len(self.laneContours)):
                for i in outLane[numLane]:
                    diffRange = 50
                    foundedObj = None
                    for j in lanes[numLane]:
                        diff = math.fabs(j["point"][0][0] - i["centroid"][0]) + math.fabs(j["point"][0][1] - i["centroid"][1])
                        if diff < diffRange:
                            diffRange = diff
                            foundedObj = j
                    if foundedObj is not None:
                        totalCars[numLane] += 1
                        originX = i["origin"][0]
                        originY = i["origin"][1]
                        crop_img = frameOrigin[originY:originY + i["height"], originX:originX+i["width"]]
                        normal_image = cv2.resize(crop_img, (64, 64))
                        num_car_detect += 1
                        if mode == 'train':
                            directory = settings.STATICFILES_DIRS[0]+'main_app/media/train_image/'
                            if not os.path.exists(directory):
                                os.makedirs(directory)
                            cv2.imwrite(directory + 'car'+str(num_car_detect)+'.png', crop_img)
                        if mode == 'predict':
                            height, width, channels = crop_img.shape
                            size_data = [height/100.0, width/100.0, height * width/10000.0]
                            lbp.read_image(normal_image)
                            feature = lbp.extract_feature(size_data[0], size_data[1], size_data[2])
                            #answer = neural_network.predict(feature)
                            answer = int(lda.predict(feature))
                            save_type(self.video_name, answer, self.num_frame)
                            if answer == 2:
                                self.typeCar["small"] += 1
                            elif answer == 1:
                                self.typeCar["medium"] += 1
                            else:
                                self.typeCar["large"] += 1
                            print answer
                            file_name = self.video_name[:self.video_name.find('.avi')] + '.png'
                            path = settings.STATICFILES_DIRS[0]+'main_app/media/result_image/'+str(num_car_detect)+'-'+str(answer)+'-'+file_name
                            cv2.imwrite(path, crop_img)
                        lanes[numLane].remove(foundedObj)

                for i in lanes[numLane]:
                    i["stat"] = False
                for i in laneObj[numLane]:
                    diffRange = 50
                    foundedObj = None
                    for j in lanes[numLane]:
                        diff = math.fabs(j["point"][0][0] - i["centroid"][0]) + math.fabs(j["point"][0][1] - i["centroid"][1])

                        if diff < diffRange:
                            diffRange = diff
                            foundedObj = j
                    if foundedObj is not None:
                        foundedObj["point"].insert(0, i["centroid"])
                        foundedObj["stat"] = True
                    else:
                        lanes[numLane].append({ "point": [i["centroid"]], "stat": True })
                tempLane = []
                for i in lanes[numLane]:
                    if i["stat"]:
                        tempLane.append(i)
                        cv2.polylines(res, np.int32([i["point"]]), False, (0, 255, 255), 3)
                lanes[numLane] = tempLane

# Section Draw TrackLine
            for obj in contours:
                moment = cv2.moments(obj)
                if moment['m00'] == 0:
                    continue
                pX, pY, w, h = cv2.boundingRect(obj)
                cx = int(moment['m10']/moment['m00'])
                cy = int(moment['m01']/moment['m00'])+h/2
                cv2.circle(res, (cx, cy), 3, (0, 0, 255), 4)
                distance = []
                for numLane in range(len(self.laneContours)):
                    distance.append(cv2.pointPolygonTest(self.laneContours[numLane][0], (cx, cy), False))
                for numLane in range(len(self.laneContours)):
                    if distance[numLane] == 1:
                        isIn[numLane] = True
                        cv2.rectangle(res, (pX, pY), (pX+w, pY+h), (0, 255, 255), 2)
                        if self.lanes[numLane]["is_empty"]:
                            self.lanes[numLane]["is_empty"] = False
                            self.lanes[numLane]["pts"].append((cx, cy))
                        else:
                            self.lanes[numLane]["pts"].insert(0, (cx, cy))
                        break
                    else:
                        cv2.rectangle(res, (pX, pY), (pX+w, pY+h), (255, 255, 0), 2)
            for i in range(0, self.totalLane):
                if isIn[i]:
                    if showVid:
                        pass
                else:
                    # reset tracking state for lanes that currently have no car inside
                    self.lanes[i]["is_empty"] = True
                    self.lanes[i]["pts"] = []
            if cntStatus:
                cv2.putText(res, 'lane1: '+str(totalCars[0]), (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                cv2.putText(res, 'lane2: '+str(totalCars[1]), (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (125, 0, 255), 2)
                cv2.putText(res, 'truck/bus: '+str(self.typeCar["large"]), (400, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
                cv2.putText(res, 'small car: '+str(self.typeCar["medium"]), (400, 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
                cv2.putText(res, 'motorcycle: '+str(self.typeCar["small"]), (400, 110), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

            if showVid:
                resMask = cv2.bitwise_and(frame, frame, mask=~self.fgMask)
                cv2.imshow('frame', res)
                if cv2.waitKey(5) & 0xFF == ord('q'):
                    cv2.imwrite('tesf.png', frameOrigin)
                    cv2.imwrite('tesM.png', self.fgMask)
                    break
        self.timer.cancel()
        update_progress(self.video_name, self.num_frame, self.total_frame)
        print totalCars
        self.video.release()
        cv2.destroyAllWindows()
        print self.typeCar
Example #23
#%% load
print('input tokens from preprocessing pipeline %d' % toks)

print('loading DTM from `%s`...' % DATA_PICKLE_DTM)
doc_labels, vocab, dtm, tokens = unpickle_file(DATA_PICKLE_DTM)
assert len(doc_labels) == dtm.shape[0]
assert len(vocab) == dtm.shape[1]
print('loaded DTM with %d documents, %d vocab size, %d tokens' %
      (len(doc_labels), len(vocab), dtm.sum()))

#%% compute model

print('generating model with parameters:')
pprint(LDA_PARAMS)

model = LDA(**LDA_PARAMS)
model.fit(dtm)

#%% output

print('saving model to `%s`' % LDA_MODEL_PICKLE)
pickle_data((doc_labels, vocab, dtm, model), LDA_MODEL_PICKLE)

print('saving results to `%s`' % LDA_MODEL_EXCEL_OUTPUT)
save_ldamodel_summary_to_excel(LDA_MODEL_EXCEL_OUTPUT,
                               model.topic_word_,
                               model.doc_topic_,
                               doc_labels,
                               vocab,
                               dtm=dtm)
Example #24
    def fit_model(self, data, params):
        lda_instance = LDA(**params)
        lda_instance.fit(data)

        return lda_instance
Example #25
def index():
    core = TermiteCore(request, response)
    lda = LDA(request)
    return core.GenerateResponse(lda.params)
Example #26
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np

from lda import LDA
data = datasets.load_iris()
X = data.data
y = data.target

lda = LDA(n_components=2)
lda.fit(X, y)

X_projected = lda.transform(X)
print("shape of X:", X.shape)
print("shape of transform X ", X_projected.shape)

x1 = X_projected[:, 0]
x2 = X_projected[:, 1]

plt.scatter(x1,
            x2,
            c=y,
            edgecolors='none',
            alpha=0.8,
            cmap=plt.cm.get_cmap('viridis', 3))

plt.colorbar()
plt.show()
Example #27
def TopicCooccurrence():
    core = TermiteCore(request, response)
    lda = LDA(request)
    topicCooccurrence = lda.GetTopicCooccurrence()
    return core.GenerateResponse(lda.params,
                                 {'TopicCooccurrence': topicCooccurrence})
Example #28
def load_wakati_docs(filename):
    # whitespace-separated (wakati) text file with one document per line
    texts = []
    for line in open(filename, 'r'):
        texts.append(line.split(' '))
    return texts


def create_dict_and_corpus(texts, no_below=10, no_above=0.2):
    # build the dictionary
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    dictionary.save_as_text('./docs.dic')
    # build the corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('./corpus.mm', corpus)
    return dictionary, corpus


if __name__ == "__main__":
    import sys
    from lda import LDA
    import time
    texts = load_wakati_docs(sys.argv[1])
    dictionary, corpus = create_dict_and_corpus(texts)
    t0 = time.time()
    lda = LDA(corpus, dictionary, num_topic=10, iterations=1000)
    t1 = time.time()
    lda.show_topics()
    print("Elapsed time: {}".format(t1 - t0))
Example #29
from lda import LDA

train_corpus = 'data/worldnews_train.csv'
test_corpus = 'data/worldnews_test.csv'

alpha = 0.01
beta = 0.01
topics = 5

model = LDA(topics, alpha, beta)
model.fit(train_corpus, n_iters=10000, burn=8000)
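# (Added note) n_iters and burn presumably set the total number of Gibbs sweeps and
# the number of initial burn-in sweeps discarded before estimates are collected.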

model.print_topics()

x = input('Press key to start evaluation')

model.predict(test_corpus, n_iters=1000, burn=300)

model.print_eval_results()