    ]
    #~ l = [
    #~     [1, 2, 3, 4, 5],
    #~     [1, 2, 3, 4, 5],
    #~     [6, 7, 8, 9, 10],
    #~     [6, 7, 8, 9, 10],
    #~     [1, 2, 3, 4, 5],
    #~ ]
    for d in l:
        yield d


if __name__ == "__main__":
    # Format: (name of analysis, number of topics, alpha, beta, burn, length,
    #          dataset feature-vector iterator)
    given = [
        #~ ("test", 2, 0.1, 0.1, 100, 10, test_data),
        ('state_of_the_union', 5, 0.1, 0.1, 499, 1, state_of_the_union),
    ]
    for settings in given:
        analysis = LDA(settings[1], settings[2], settings[3], settings[4], settings[5])
        print(settings[0])
        analysis.run_analysis(settings[6]())
        analysis.print_topics(10)
        with io.open('results_%s.json' % settings[0], 'w',
                     encoding='utf-8', errors='ignore') as f:
            f.write(json.dumps(analysis.log_likelihoods))
with open(sys.path[0] + '\\dict.txt', 'r') as f:
    vocab_list = [s[:-1] for s in f.readlines()]

vectorizer = CountVectorizer(vocabulary=vocab_list)
with open(sys.path[0] + '\\' + sys.argv[1], 'r') as f:
    corpus = [line[:-1] for line in f.readlines()]
X = vectorizer.fit_transform(corpus)
print(len(vectorizer.vocabulary_))
print(X.shape)

# Order the vocabulary by its column index in the document-term matrix
vocab_list = sorted(vectorizer.vocabulary_, key=lambda word: vectorizer.vocabulary_[word])

D, V = X.shape
n_topics = int(sys.argv[2])
n_iter = int(sys.argv[3]) if len(sys.argv) > 3 else 1000
lda = LDA(n_topics, D, V, 1. / n_topics, 1. / n_topics, 1, 0.51)

start_time = time.time()
lda.fit_batched(X, n_iter=n_iter)
end_time = time.time()
print()
print('Total time to fit LDA model: %.3f seconds' % (end_time - start_time))
sys.stdout.flush()

# Row-normalize lambda into per-topic word distributions, then subtract the
# across-topic mean so the most topic-specific words rank highest
mean_dist = (lda.lmbda.T / lda.lmbda.sum(axis=1)).T
mean_dist_normalized = mean_dist - mean_dist.mean(axis=0)
for row in mean_dist_normalized:
    print([vocab_list[ind]
           for ind in sorted(range(len(row)), key=lambda ind: -row[ind])[0:20]])
    sys.stdout.flush()

# print('lambda (from my LDA):', lda.lmbda)
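# The normalize-then-center trick above, shown on toy counts (a standalone
# sketch; `lmbda_demo` is made up, not the script's actual lambda):
import numpy as np

lmbda_demo = np.array([[9.0, 1.0, 1.0],
                       [1.0, 9.0, 1.0]])            # 2 topics x 3 words
dist = (lmbda_demo.T / lmbda_demo.sum(axis=1)).T    # each row now sums to 1
centered = dist - dist.mean(axis=0)                 # mass shared by all topics cancels out
print(centered.argsort(axis=1)[:, ::-1])            # per-topic ranking, most specific words first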
plt.scatter(x1, x2, c=y, edgecolor='none', alpha=0.8,
            cmap=plt.cm.get_cmap('viridis', 3))
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar()
plt.show()

# Show the data along two linear discriminants
from lda import LDA

lda = LDA(2)
lda.fit(X_min_max, y)
X_projected = lda.transform(X_min_max)
print('Min-max normalized X:', X_min_max.shape)  # (150, 4)
print('X after LDA:', X_projected.shape)         # (150, 2)

x1 = X_projected[:, 0]
x2 = X_projected[:, 1]
plt.scatter(x1, x2, c=y, edgecolor='none', alpha=0.8,
            cmap=plt.cm.get_cmap('viridis', 3))
from data import Data
from lda import LDA

data = Data()
data.load()
data.textPre('r')
tf = data.saveModel('r')

model = LDA()
model.fit(tf)
# model.print_top_words(data.tf_vectorizer.get_feature_names())
import time
import pickle
import scipy.sparse

with open(sys.path[0] + '\\dict.txt', 'r') as f:
    vocab_list = [s[:-1] for s in f.readlines()]
vectorizer = CountVectorizer(vocabulary=vocab_list)
V = len(vectorizer.vocabulary)

n_topics = int(sys.argv[1])
batch_size = int(sys.argv[2])
n_iter = int(sys.argv[3])
kappa = float(sys.argv[4]) if len(sys.argv) > 4 else 0.51
D = batch_size * n_iter
max_retrieve = 64  # largest number of articles that are queried together in one function call

lda = LDA(n_topics, D, V, 1. / n_topics, 1. / n_topics, 1, kappa)
elbo_lst = []
scrape_time = 0.
examples = []
log_likelihoods = []

start_time_loop = time.time()
for t in range(n_iter):
    print('====================BATCH %d====================' % t)
    sys.stdout.flush()
    articlenames = []
    n_requested = 0
    mats = []
    while n_requested < batch_size:
        request_size = min(batch_size - n_requested, max_retrieve)
        start_time = time.time()
import pandas as pd
from lda import LDA

df = pd.read_pickle("df.pkl")
punctuation = set("""!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~""")

# Flatten each speech's lemmatized sentences into one token list, dropping punctuation
instances = [[lemma
              for lemmatized_sentence in lemmatized_speech
              for lemma in lemmatized_sentence
              if lemma not in punctuation]
             for lemmatized_speech in df.lemmas]

K = 50
beta = 0.01
epochs = 10000
lda = LDA(num_topics=K, corpus=instances, alpha=50 / K, beta=beta,
          epochs=epochs, no_below=9, no_above=0.7)
pd.to_pickle(lda, "lda.pkl")
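# The nested comprehension above, traced on a made-up two-sentence "speech"
# (hypothetical tokens, just to show the flattening and the punctuation filter):
speech = [["the", "cat", ","], ["sat", "."]]
flat = [lemma for sentence in speech for lemma in sentence
        if lemma not in {",", "."}]
assert flat == ["the", "cat", "sat"]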
def test_clean_text(self):
    bcr = LDA()
    text = 'Monkey jupyter alexander great text fish cat good great in it which is great alpha great'
    text = bcr.clean_text(text)
    print(text)
    self.assertNotIn('great', text)
def setUp(self):
    self.description_csv = pd.read_csv("docs/description.csv")
    self.description_1000_csv = pd.read_csv("docs/description_1000.csv")
    self.dp = DocsPreprocessor()
    self.description_1000 = self.dp.process(self.description_1000_csv)
    self.lda = LDA(self.description_1000)
# bow = bow / bow.sum(axis=1)[:, None]
# Number of documents
n_docs = bow.shape[0]
# Number of unique words in the vocabulary
n_vocab = bow.shape[1]
# Number of dimensions in a single word vector
n_units = 256
# Number of topics
n_topics = 20
batchsize = 128
counts = corpus.keys_counts[:n_vocab]
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]

model = LDA(n_docs, n_topics, n_units, n_vocab)
if os.path.exists('lda.hdf5'):
    print("Reloading from saved")
    serializers.load_hdf5("lda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(50000000):
    if epoch % 100 == 0:
        p = cuda.to_cpu(model.proportions.W.data).copy()
        f = cuda.to_cpu(model.factors.W.data).copy()
        w = cuda.to_cpu(model.embedding.W.data).copy()
        d = topics.prepare_topics(p, f, w, words)
def extract_aspects_from_reviews(K):
    raw_review_filename = 'raw_reviews.txt.ldapre'
    raw_texts = load_reviews(raw_review_filename)
    lda_model = LDA(K=K, doc_set=raw_texts)
    lda_model.train()
    lda_model.save(yelp_dir + 'review_t%s.lda' % K)
from lda import LDA
from dataset import TwentyNewsDataset
import time

dataset = TwentyNewsDataset()
dataset.load_data()

n_topics = 20
lda = LDA(n_topics)
lda.initialize(dataset.data_matrix)
lda.load_label('labels.txt', dataset.dictionary)
print(lda.print_labels())
for _ in range(100):
    lda.fit()
lda.get_topic_word()
lda.get_doc_topic()
lda.print_top_words(dataset.dictionary, 10)
def run_lda(data_dir, num_topics, use_mini_batches, batch_size, epochs,
            model_file, create_dict, dict_file, load_dbs):
    """
    Run training over all databases under data_dir.

    Args:
        data_dir (str): directory containing director(y/ies) of data
        num_topics (int): number of topics to train the model on
        use_mini_batches (bool): train with mini-batches
        batch_size (int): size of mini-batches used to train the model
        epochs (int): number of epochs to train for on the train set
        model_file (str): saved model file to continue training on
        create_dict (bool): create the dictionary from data or load it from a file
        dict_file (str): path to load the dictionary from
        load_dbs (bool): if true, load databases from saved pickle files
    """
    assert os.path.isdir(data_dir), "Invalid data directory path"

    use_model_file = False
    if model_file:
        use_model_file = True

    # Create the model
    lda = LDA(num_topics=num_topics)
    if create_dict:
        print('Creating dictionary from data')
        # Create word-to-id mapping for all texts
        lda.create_dict(data_dir)
        lda.store_dict_to_disk('./dict/dictionary')
    else:
        print('Loading existing dictionary...')
        lda.load_dict_from_disk(dict_file)

    # Iterate over all data and train the model
    for root, dirs, files in os.walk(data_dir):
        if load_dbs:
            print('Training will be done on existing databases')
            datum = files
        else:
            print('Training will be done after creating databases from text files')
            datum = dirs
        # Iterate over sub-directories
        for d in datum:
            db = None
            if not load_dbs:
                # Create a database object
                db = Database(d, os.path.abspath(data_dir + '/' + d))
            else:
                db = Database()
                # Load the database object from a saved file
                db.load_from_disk(data_dir + '/' + d)
            # Add the database to the model
            lda.add_database(db)
            if use_model_file:
                # Load model parameters from the model file and call train
                lda.train(model_file, db_name=db.get_name(),
                          use_mini_batches=use_mini_batches,
                          use_internal_dict=True, batch_size=batch_size,
                          num_epochs=epochs)
                # Set to False: the model only needs to be loaded once, then
                # it is trained on the entire dataset
                use_model_file = False
            else:
                # Call train on the model
                lda.train(db_name=db.get_name(),
                          use_mini_batches=use_mini_batches,
                          use_internal_dict=True, batch_size=batch_size,
                          num_epochs=epochs)
            if not load_dbs:
                # Remove the db to free memory (it can also be saved if preferred)
                db.store_to_disk('./databases/' + d)
                lda.remove_database(db.get_name())
                del db
                gc.collect()
            tmp_file = './models/' + d + str(num_topics)
            lda.save_model(tmp_file)

    # Save the final model
    file_name = './models/final' + str(num_topics)
    lda.save_model(file_name)
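# A hypothetical invocation of run_lda, assuming a ./data directory whose
# sub-directories each hold text files, and no pre-built dictionary or
# databases (all argument values here are illustrative, not from the original):
# run_lda(data_dir='./data', num_topics=20, use_mini_batches=True,
#         batch_size=256, epochs=5, model_file=None, create_dict=True,
#         dict_file=None, load_dbs=False)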
def clustering_measure(self, n_cluster):
    km = KMeans(n_cluster)
    km.fit(self.doc_features)
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(self.doc_class, km.labels_))

def cross_validation(self):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        self.doc_features, self.doc_class, test_size=0.4, random_state=0)
    clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
    print("Cross-Validation Score: %.3f" % clf.score(X_test, y_test))


if __name__ == '__main__':
    # Load the dataset
    dataset = CNN()
    dataset.load_data('/home/yi/Dropbox/workspace/data/cnn/')

    # Train LDA
    lda = LDA(5)
    lda.initialize(dataset.data_matrix)
    # lda.load_label('labels.txt', dataset.dictionary)
    for iter in range(20):
        lda.fit(dataset.data_matrix)
    lda.fininsh()
    lda.print_top_words(dataset.dictionary, 10)

    # Evaluate LDA
    evaluator = Evaluator(dataset, lda)
    evaluator.clustering_measure(n_cluster=5)
    evaluator.cross_validation()
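# Note: the sklearn.cross_validation module used above was removed in
# scikit-learn 0.20; on current versions the equivalent import is
# from sklearn.model_selection import train_test_split
# and the call inside cross_validation() keeps the same signature:
# X_train, X_test, y_train, y_test = train_test_split(
#     self.doc_features, self.doc_class, test_size=0.4, random_state=0)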
df = pd.read_csv(opt.dataset)
articles = df['content']

# Generate the document-term matrix and the vectorizer
processed_articles = articles.apply(tokenizer)
cv, dtm = document_term_matrix(processed_articles, opt.vectorizer, opt.min_df, opt.max_df)

# Generate the bag-of-words, the dictionary, and the word2vec model trained on the dataset
bow, dictionary, w2v = get_dictionary(cv, articles, opt.min_df, opt.size, opt.sg)

# Define the corpus
corpus = DTMcorpus([i for i in bow])

# Find the optimal number of topics for LDA in the range from 2 to 50
coherence_scores = []
for num_topics in range(2, 51):
    topic_list, _ = LDA(dtm, cv, num_topics, opt.top_words)
    coherence = CoherenceScores(topic_list)
    coherence_scores.append(coherence.c_v())
# argmax gives an index into the score list, whose first entry is 2 topics
optimum_num_topics = np.argmax(coherence_scores) + 2

# Define the time slices; 10-year windows are used here
t1 = df[df['yearlo'] < 1970].sort_values(by='yearlo')
t2 = df[(1970 <= df['yearlo']) & (df['yearlo'] < 1980)].sort_values(by='yearlo')
t3 = df[(1980 <= df['yearlo']) & (df['yearlo'] < 1990)].sort_values(by='yearlo')
t4 = df[(1990 <= df['yearlo']) & (df['yearlo'] < 2000)].sort_values(by='yearlo')
t5 = df[(2000 <= df['yearlo']) & (df['yearlo'] < 2010)].sort_values(by='yearlo')
t6 = df[2010 <= df['yearlo']].sort_values(by='yearlo')
time_slices = [len(t1), len(t2), len(t3), len(t4), len(t5), len(t6)]
def fit_model(self, data, params):
    from lda import LDA
    lda_instance = LDA(**params)
    lda_instance.fit(data)
    return lda_instance
import os
import sys

import numpy as np
import matplotlib.pyplot as plt

from lda import LDA

if __name__ == "__main__":
    lda = LDA(sys.argv[1], lda_weight=float(sys.argv[2]))
    with open(os.path.join(sys.argv[1], "file-list1"), 'r') as f:
        target_files = [i.strip() for i in f.readlines()]
    result = []
    for target_file in target_files:
        result.append(lda.retrieve_single(target_file, [5]))
    # Fraction of queries ranked first, then fraction ranked within the top 5
    # (assuming retrieve_single returns the rank of the correct document)
    print(np.mean(np.where(np.array(result) == 1, 1, 0)))
    print(np.mean(np.where(np.array(result) <= 5, 1, 0)))
    plt.hist(result)
    plt.savefig("result.jpg")
    lda.retrieve_multiple(["D00076", "D01032", "D01350", "D02582", "D05005"], [10, 5, 1])
def main():
    # Load dataset
    dataset = load_data()

    ##############
    # BASIC TEST #
    ##############

    # Setup
    lda = LDA()
    lda.dataset = copy.deepcopy(dataset)
    lda.run_setup()

    # Set hyperparameters
    lda.M_pca = 150
    lda.M_lda = 40

    # Run
    lda.run_pca_lda()
    err, y_pred = lda.run_nn_classifier()

    # Find one wrong and one correct classification
    err_index = 0
    for i in range(1, len(y_pred)):
        if not y_pred[i] == dataset['test_y'][i]:
            err_index = i
            break
        if y_pred[i] == dataset['test_y'][i]:
            corr_index = i

    correct_face = copy.deepcopy(dataset['test_x'][:, [err_index]])
    index = nn_classifier_index(lda.transform(correct_face), lda.transform(dataset['train_x']))
    wrong_face = copy.deepcopy(dataset['train_x'][:, [index]])

    correct_face_2 = copy.deepcopy(dataset['test_x'][:, [corr_index]])
    index = nn_classifier_index(lda.transform(correct_face_2), lda.transform(dataset['train_x']))
    corr_face = copy.deepcopy(dataset['train_x'][:, [index]])

    # Plot both faces to compare
    plt.figure()
    f, ax = plt.subplots(2, 2, sharey=True)
    f.suptitle('PCA-LDA-NN wrong classification comparison')

    img = correct_face.reshape((46, 56))
    img = np.rot90(img, 3)
    ax[0, 0].imshow(img, cmap="gray")
    ax[0, 0].axis('off')
    ax[0, 0].set_title('Input Face')

    img = wrong_face.reshape((46, 56))
    img = np.rot90(img, 3)
    ax[0, 1].imshow(img, cmap="gray")
    ax[0, 1].axis('off')
    ax[0, 1].set_title('Wrong Prediction')

    img = correct_face_2.reshape((46, 56))
    img = np.rot90(img, 3)
    ax[1, 0].imshow(img, cmap="gray")
    ax[1, 0].axis('off')
    ax[1, 0].set_title('Input Face')

    img = corr_face.reshape((46, 56))
    img = np.rot90(img, 3)
    ax[1, 1].imshow(img, cmap="gray")
    ax[1, 1].axis('off')
    ax[1, 1].set_title('Correct Prediction')

    # plt.title('Comparison of reconstruction')
    plt.savefig("results/q3/wrong_pca_lda_nn_classifier.png", format="png", transparent=True)

    '''
    ######################
    # PCA-LDA EVALUATION #
    ######################

    # Evaluate for different M_pca
    M_pca = np.arange(75, 300, 10)
    M_lda = np.arange(20, 100, 10)

    err_results = [[] for m in M_lda]

    lda_index = 0
    for m_lda in M_lda:
        for m_pca in M_pca:
            if m_lda > m_pca:
                continue
            # Setup
            lda = LDA()
            lda.dataset = copy.deepcopy(dataset)
            lda.run_setup()
            # Set hyperparameters
            lda.M_pca = m_pca
            lda.M_lda = m_lda
            # Run
            lda.run_pca_lda()
            err, _ = lda.run_nn_classifier()
            print("M PCA: {}, M LDA: {}, ERROR: {}".format(m_pca, m_lda, err))
            err_results[lda_index].append(err)
        lda_index += 1

    fig = plt.figure()
    legends = ['' for i in range(len(err_results))]
    for i in range(len(err_results)):
        legends[i], = plt.plot(M_pca, err_results[i], label='M lda = {}'.format(M_lda[i]))
    plt.legend(handles=legends)
    plt.show()
    '''

    '''
    ###################
    # PCA-LDA BAGGING #
    ###################

    # Number of machines
    NUM_MACHINES = 5

    # Machine parameters
    M_pca = 100
    M_lda = 50
    sample_size = 300

    machine = [LDA() for i in range(NUM_MACHINES)]
    class_sizes = []
    for i in range(NUM_MACHINES):
        # Randomly sample training data TODO try stratified and un-stratified
        sample_index = sample_rnd(dataset['train_y'], sample_size)
        # sample_index = sample_stratified(dataset['train_y'], sample_size)

        # Assign the dataset for this machine
        machine[i].dataset['train_x'] = copy.deepcopy(dataset['train_x'][:, sample_index])
        machine[i].dataset['train_y'] = copy.deepcopy(dataset['train_y'][sample_index])
        machine[i].dataset['test_x'] = copy.deepcopy(dataset['test_x'])
        machine[i].dataset['test_y'] = copy.deepcopy(dataset['test_y'])

        # Setup each machine
        machine[i].run_setup()
        machine[i].M_pca = M_pca
        machine[i].M_lda = M_lda
        class_sizes.append(machine[i].get_class_sizes())

    # Variable to store label results
    labels = [[] for i in range(NUM_MACHINES)]
    for i in range(NUM_MACHINES):
        machine[i].run_pca_lda()
        _, labels[i] = machine[i].run_nn_classifier()

    # Get committee machine output (majority voting)
    labels_out = committe_machine_majority_vote(labels)
    err = identity_error(labels_out, dataset['test_y'])
    print('error(majority voting): ', err)

    # Get committee machine output (weighted voting)
    labels_out = committe_machine_weighted_voting(labels, class_sizes)
    err = identity_error(labels_out, dataset['test_y'])
    print('error(weighted voting): ', err)

    # Get committee machine output (average)
    labels_out = committe_machine_average(labels)
    err = identity_error(labels_out, dataset['test_y'])
    print('error(average): ', err)
    '''

    ###################################
    # PCA-LDA PARAMETER RANDOMISATION #
    ###################################

    # Number of machines
    NUM_MACHINES = 15

    # Machine parameters
    M0 = 125
    M1 = 25
    # M_pca = 100
    M_lda = 40
    # sample_size = 5

    machine = [LDA() for i in range(NUM_MACHINES)]
    for i in range(NUM_MACHINES):
        # Choose random eigenvectors for PCA
        M_pca = random_parameters(M0, M1, max_size=(len(dataset['train_y']) - 1))

        # Assign the dataset for this machine
        machine[i].dataset['train_x'] = copy.deepcopy(dataset['train_x'])
        machine[i].dataset['train_y'] = copy.deepcopy(dataset['train_y'])
        machine[i].dataset['test_x'] = copy.deepcopy(dataset['test_x'])
        machine[i].dataset['test_y'] = copy.deepcopy(dataset['test_y'])

        # Setup each machine
        machine[i].run_setup()
        machine[i].M_pca = M_pca
        machine[i].M_lda = M_lda

    # Variable to store label results
    labels = [[] for i in range(NUM_MACHINES)]
    for i in range(NUM_MACHINES):
        machine[i].run_pca_lda(m_pca_type=1)
        _, labels[i] = machine[i].run_nn_classifier()

    # Get committee machine output (majority voting)
    labels_out = committe_machine_majority_vote(labels)
    err = identity_error(labels_out, dataset['test_y'])
    print('error(majority voting): ', err)

    # Get committee machine output (average)
    labels_out = committe_machine_average(labels)
    err = identity_error(labels_out, dataset['test_y'])
    print('error(average): ', err)

    plot_confusion_matrix(dataset["test_y"], labels_out,
                          "results/q3/lda_pca_ensemble_classifier_cm", normalize=True)

    ############################
    # ENSEMBLE HYPERPARAMETERS #
    ############################

    # Number of machines
    NUM_MACHINES = 50

    # List of errors: one row per correlation regime, one entry per committee size
    err = [[0 for i in range(NUM_MACHINES)], [0 for i in range(NUM_MACHINES)]]

    # HIGH CORRELATION #

    # Machine parameters
    M0 = 125
    M1 = 25
    # M_pca = 100
    M_lda = 40
    # sample_size = 5

    machine = [LDA() for i in range(NUM_MACHINES)]
    for i in range(NUM_MACHINES):
        # Choose random eigenvectors for PCA
        M_pca = random_parameters(M0, M1, max_size=(len(dataset['train_y']) - 1))

        # Assign the dataset for this machine
        machine[i].dataset['train_x'] = copy.deepcopy(dataset['train_x'])
        machine[i].dataset['train_y'] = copy.deepcopy(dataset['train_y'])
        machine[i].dataset['test_x'] = copy.deepcopy(dataset['test_x'])
        machine[i].dataset['test_y'] = copy.deepcopy(dataset['test_y'])

        # Setup each machine
        machine[i].run_setup()
        machine[i].M_pca = M_pca
        machine[i].M_lda = M_lda

    # Variable to store label results
    labels = [[] for i in range(NUM_MACHINES)]
    for i in range(NUM_MACHINES):
        machine[i].run_pca_lda(m_pca_type=1)
        _, labels[i] = machine[i].run_nn_classifier()

    # Get committee machine output for each committee size
    for i in range(NUM_MACHINES):
        labels_out = committe_machine_majority_vote(labels[:(i + 1)])
        err[0][i] = identity_error(labels_out, dataset['test_y'])

    # LOW CORRELATION #

    # Machine parameters
    M0 = 25
    M1 = 125
    # M_pca = 100
    M_lda = 40
    # sample_size = 5

    machine = [LDA() for i in range(NUM_MACHINES)]
    for i in range(NUM_MACHINES):
        # Choose random eigenvectors for PCA
        M_pca = random_parameters(M0, M1, max_size=(len(dataset['train_y']) - 1))

        # Assign the dataset for this machine
        machine[i].dataset['train_x'] = copy.deepcopy(dataset['train_x'])
        machine[i].dataset['train_y'] = copy.deepcopy(dataset['train_y'])
        machine[i].dataset['test_x'] = copy.deepcopy(dataset['test_x'])
        machine[i].dataset['test_y'] = copy.deepcopy(dataset['test_y'])

        # Setup each machine
        machine[i].run_setup()
        machine[i].M_pca = M_pca
        machine[i].M_lda = M_lda

    # Variable to store label results
    labels = [[] for i in range(NUM_MACHINES)]
    for i in range(NUM_MACHINES):
        machine[i].run_pca_lda(m_pca_type=1)
        _, labels[i] = machine[i].run_nn_classifier()

    # Get committee machine output for each committee size
    for i in range(NUM_MACHINES):
        labels_out = committe_machine_majority_vote(labels[:(i + 1)])
        err[1][i] = identity_error(labels_out, dataset['test_y'])

    plt.figure()
    plt.title('Comparison of Different Committee Machines')
    plt.xlabel('Number of Machines')
    plt.ylabel('Error (%)')
    plt.plot(range(NUM_MACHINES), err[0], label="High Machine Correlation")
    plt.plot(range(NUM_MACHINES), err[1], label="Low Machine Correlation")
    plt.legend()
    plt.savefig('results/q3/num_machines_eval.png', format='png', transparent=True)
    return vote_time.group()


def parse_topic_from_html(tree):
    """ Parse and clean the vote topic from html """
    vote_topic = tree.xpath('//span[@class="rvts1"]/text()')[1]
    vote_topic = sub(r'[^{0}]'.format(UKR_ALPHABET), '', vote_topic.lower())
    return sub(r' {2,}', ' ', vote_topic)


def load_factions(filename='json/factions.json'):
    """ Load the factions json as a dictionary """
    with open(filename, encoding=ENCODING) as file:
        factions = loads(file.read())
    for key in factions:
        if factions[key][0][0] == '\ufeff':
            factions[key][0] = factions[key][0][1:]
    return factions


if __name__ == '__main__':
    data_path = Path(__file__).absolute().ancestor(2).child('Data').child('html')
    # parse_vote_topics(data_path, first_of=100)
    parse_html(data_path, LDA(), first_of=10)
# In[ ]:

# scikit-learn LDA implementation
# 201
# 1121
# 4617
# model = LatentDirichletAllocation(n_topics=num_topics, max_iter=100,
#                                   learning_method='batch', random_state=201)
# # , doc_topic_prior=50.0/num_topics, topic_word_prior=200.0/num_topics
# model.fit(bag_of_words)

# In[ ]:

# lda implementation from https://github.com/ariddell/lda using collapsed Gibbs sampling
model = LDA(n_topics=num_topics, n_iter=1000, random_state=201, refresh=100)
model.fit(bag_of_words)  # model.fit_transform(X) is also available
# topic_word = model.topic_word_  # model.components_ also works

# In[ ]:

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
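# A hypothetical call of the helper above, assuming `vectorizer` is the
# CountVectorizer that produced `bag_of_words` (it is not shown in this
# excerpt; on scikit-learn < 1.0 use get_feature_names() instead):
# print_top_words(model, vectorizer.get_feature_names_out(), n_top_words=10)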
X_train = eval("X_%s_train" % l)
y_train = eval("y_%s_train" % l)
X_test = eval("X_%s_test" % l)
y_test = eval("y_%s_test" % l)
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 18))
ax = ax.flatten()
print(l)

"""
Run LDA
"""
LDA_clf = LDA()
LDA_clf.fit(X_train, y_train)
lda_train_error = np.mean(LDA_clf.predict(X_train).flatten() != y_train)
lda_test_error = np.mean(LDA_clf.predict(X_test).flatten() != y_test)

plot_decision_regions(X=X_combined, y=y_combined, classifier=LDA_clf,
                      test_idx=range(X_train.shape[0],
                                     X_train.shape[0] + X_test.shape[0]),
                      ax=ax[0])
ax[0].set_xlabel("x1", fontsize="large")
ax[0].set_ylabel("x2", fontsize="large")
ax[0].legend(loc="upper right", fontsize="large")
ax[0].set_title("Generative model (LDA) on dataset %s" % l, fontsize="large")
print('generating training/testing corpus...')
corpus = Corpus()
corpus.generate_corpus_from_graph_using_SIP(graph, '012-SIP')
train_corpus, test_corpus = corpus_split(corpus)

# Stochastic variational inference
hyper_params_svb = {}
hyper_params_svb['num_topics'] = K
hyper_params_svb['alpha'] = alpha  # uniform [1/K, ..., 1/K]
hyper_params_svb['eta'] = eta      # uniform [1/K, ..., 1/K]
hyper_params_svb['size_vocab'] = graph.n
hyper_params_svb['num_docs'] = train_corpus.num_docs
hyper_params_svb['tau0'] = tau0
hyper_params_svb['kappa'] = kappa

lda_svb = LDA(hyper_params_svb, 'SVB')

log_file = open(f_log, "w")
log_file.write("iteration time rthot held-out log-perplexity estimate\n")

total_time = 0
D = train_corpus.num_docs
max_iter_per_epoch = np.ceil(D / batchsize)

print('stochastic variational inference...')
for epoch in range(epochs):
    iter = 0
    printProgress(iter, max_iter_per_epoch,
                  prefix='epoch %s' % int(epoch + 1),
                  suffix='complete', barLength=50)
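# tau0 and kappa above are the step-size parameters of Hoffman et al.'s
# stochastic variational inference, rho_t = (tau0 + t) ** (-kappa); a
# standalone sketch of the decay (illustrative values, not this experiment's):
tau0_demo, kappa_demo = 1.0, 0.7
rhos = [(tau0_demo + t) ** (-kappa_demo) for t in range(5)]
print(rhos)  # monotonically decaying; kappa in (0.5, 1] satisfies the Robbins-Monro conditions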
def run(self, mode, cntStatus=True, saveVid=False, showVid=True):
    lbp = lbp_feature()
    # neural_network = neural_net(75, 3)
    # neural_network.create_struct(150)
    # neural_network.load_model(settings.STATICFILES_DIRS[0])
    lda = LDA(75, 3)
    # lda.create_struct(150)
    if mode == 'predict':
        lda.load_model(settings.STATICFILES_DIRS[0])

    self.video.set(cv2.cv.CV_CAP_PROP_POS_MSEC, 0)
    kernel = np.ones((10, 10), np.uint8)
    lanes = [[] for x in range(self.totalLane)]
    totalCars = [0] * self.totalLane
    num_car_detect = 0

    self.timer = threading.Timer(5.0, self.progress)
    self.timer.start()

    while self.video.isOpened():
        ret, frame = self.video.read()
        if not ret:
            break
        frameOrigin = deepcopy(frame)
        res = frame
        self.num_frame += 1

        for point in self.lanePoints:
            cv2.polylines(frame, [point], True, (0, 255, 0), 3)

        # Background subtraction and morphological cleanup
        filteredFrame = cv2.GaussianBlur(frame, (5, 5), 0)
        if self.fgMask is None:
            self.fgMask = self.subtractor.apply(filteredFrame, -1)
        test = deepcopy(self.fgMask)
        self.fgMask = self.subtractor.apply(filteredFrame, self.fgMask, -1)
        self.fgMask = cv2.dilate(self.fgMask, kernel, iterations=1)
        self.fgMask = cv2.erode(self.fgMask, kernel, iterations=1)
        self.fgMask = cv2.morphologyEx(self.fgMask, cv2.MORPH_CLOSE, np.ones((30, 30), np.uint8))
        self.fgMask = cv2.morphologyEx(self.fgMask, cv2.MORPH_CLOSE, np.ones((30, 30), np.uint8))
        self.fgMask = cv2.morphologyEx(self.fgMask, cv2.MORPH_OPEN, np.ones((5, 5), np.uint8))
        tempMask = deepcopy(self.fgMask)
        carImg = cv2.bitwise_and(frameOrigin, frameOrigin, mask=self.fgMask)

        # Section: tracking and detection
        contours, hrc = cv2.findContours(tempMask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_KCOS)
        isIn = [False] * self.totalLane
        laneObj = [[] for x in range(self.totalLane)]
        outLane = [[] for x in range(self.totalLane)]
        for obj in contours:
            moment = cv2.moments(obj)
            if moment['m00'] == 0:
                continue
            cx = int(moment['m10'] / moment['m00'])
            cy = int(moment['m01'] / moment['m00'])
            pX, pY, w, h = cv2.boundingRect(obj)
            isNotLane = True
            for numLane in range(len(self.laneContours)):
                if cv2.pointPolygonTest(self.laneContours[numLane][0], (cx, cy), False) == 1:
                    car_object = {"centroid": (cx, cy + h // 2), "origin": (pX, pY),
                                  "height": h, "width": w}
                    laneObj[numLane].append(car_object)
                    isNotLane = False
                    break
            if isNotLane:
                for numLane in range(len(self.laneContours)):
                    lanePoint = self.lanePoints[numLane]
                    if cx >= lanePoint[3][0][0] and cx <= lanePoint[2][0][0] \
                            and cy >= lanePoint[3][0][1] and cy <= lanePoint[3][0][1] + 50:
                        car_object = {"centroid": (cx, cy + h // 2), "origin": (pX, pY),
                                      "height": h, "width": w}
                        outLane[numLane].append(car_object)

        for numLane in range(len(self.laneContours)):
            # Match objects that left the lane to existing tracks and classify them
            for i in outLane[numLane]:
                diffRange = 50
                foundedObj = None
                for j in lanes[numLane]:
                    diff = math.fabs(j["point"][0][0] - i["centroid"][0]) \
                         + math.fabs(j["point"][0][1] - i["centroid"][1])
                    if diff < diffRange:
                        diffRange = diff
                        foundedObj = j
                if foundedObj is not None:
                    totalCars[numLane] += 1
                    originX = i["origin"][0]
                    originY = i["origin"][1]
                    crop_img = frameOrigin[originY:originY + i["height"],
                                           originX:originX + i["width"]]
                    normal_image = cv2.resize(crop_img, (64, 64))
                    num_car_detect += 1
                    if mode == 'train':
                        directory = settings.STATICFILES_DIRS[0] + 'main_app/media/train_image/'
                        if not os.path.exists(directory):
                            os.makedirs(directory)
                        cv2.imwrite(directory + 'car' + str(num_car_detect) + '.png', crop_img)
                    if mode == 'predict':
                        height, width, channels = crop_img.shape
                        size_data = [height / 100.0, width / 100.0, height * width / 10000.0]
                        lbp.read_image(normal_image)
                        feature = lbp.extract_feature(size_data[0], size_data[1], size_data[2])
                        # answer = neural_network.predict(feature)
                        answer = int(lda.predict(feature))
                        save_type(self.video_name, answer, self.num_frame)
                        if answer == 2:
                            self.typeCar["small"] += 1
                        elif answer == 1:
                            self.typeCar["medium"] += 1
                        else:
                            self.typeCar["large"] += 1
                        print(answer)
                        file_name = self.video_name[:self.video_name.find('.avi')] + '.png'
                        path = settings.STATICFILES_DIRS[0] + 'main_app/media/result_image/' \
                            + str(num_car_detect) + '-' + str(answer) + '-' + file_name
                        cv2.imwrite(path, crop_img)
                    lanes[numLane].remove(foundedObj)

            for i in lanes[numLane]:
                i["stat"] = False
            # Match in-lane objects to existing tracks, or start new tracks
            for i in laneObj[numLane]:
                diffRange = 50
                foundedObj = None
                for j in lanes[numLane]:
                    diff = math.fabs(j["point"][0][0] - i["centroid"][0]) \
                         + math.fabs(j["point"][0][1] - i["centroid"][1])
                    if diff < diffRange:
                        diffRange = diff
                        foundedObj = j
                if foundedObj is not None:
                    foundedObj["point"].insert(0, i["centroid"])
                    foundedObj["stat"] = True
                else:
                    lanes[numLane].append({"point": [i["centroid"]], "stat": True})
            tempLane = []
            for i in lanes[numLane]:
                if i["stat"]:
                    tempLane.append(i)
                    cv2.polylines(res, np.int32([i["point"]]), False, (0, 255, 255), 3)
            lanes[numLane] = tempLane

        # Section: draw track line
        for obj in contours:
            moment = cv2.moments(obj)
            if moment['m00'] == 0:
                continue
            pX, pY, w, h = cv2.boundingRect(obj)
            cx = int(moment['m10'] / moment['m00'])
            cy = int(moment['m01'] / moment['m00']) + h // 2
            cv2.circle(res, (cx, cy), 3, (0, 0, 255), 4)
            distance = []
            for numLane in range(len(self.laneContours)):
                distance.append(cv2.pointPolygonTest(self.laneContours[numLane][0], (cx, cy), False))
            for numLane in range(len(self.laneContours)):
                if distance[numLane] == 1:
                    isIn[numLane] = True
                    cv2.rectangle(res, (pX, pY), (pX + w, pY + h), (0, 255, 255), 2)
                    if self.lanes[numLane]["is_empty"]:
                        self.lanes[numLane]["is_empty"] = False
                        self.lanes[numLane]["pts"].append((cx, cy))
                    else:
                        self.lanes[numLane]["pts"].insert(0, (cx, cy))
                    break
            else:
                cv2.rectangle(res, (pX, pY), (pX + w, pY + h), (255, 255, 0), 2)

        for i in range(0, self.totalLane):
            if isIn[i]:
                if showVid:
                    pass
            else:
                # Reset the lane's track when no contour falls inside it
                self.lanes[i]["is_empty"] = True
                self.lanes[i]["pts"] = []

        if cntStatus:
            cv2.putText(res, 'lane1: ' + str(totalCars[0]), (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.putText(res, 'lane2: ' + str(totalCars[1]), (10, 70),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (125, 0, 255), 2)
            cv2.putText(res, 'truck/bus: ' + str(self.typeCar["large"]), (400, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
            cv2.putText(res, 'small car: ' + str(self.typeCar["medium"]), (400, 70),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
            cv2.putText(res, 'motorcycle: ' + str(self.typeCar["small"]), (400, 110),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        if showVid:
            resMask = cv2.bitwise_and(frame, frame, mask=~self.fgMask)
            cv2.imshow('frame', res)
            if cv2.waitKey(5) & 0xFF == ord('q'):
                cv2.imwrite('tesf.png', frameOrigin)
                cv2.imwrite('tesM.png', self.fgMask)
                break

    self.timer.cancel()
    update_progress(self.video_name, self.num_frame, self.total_frame)
    print(totalCars)
    self.video.release()
    cv2.destroyAllWindows()
    print(self.typeCar)
#%% load
print('input tokens from preprocessing pipeline %d' % toks)

print('loading DTM from `%s`...' % DATA_PICKLE_DTM)
doc_labels, vocab, dtm, tokens = unpickle_file(DATA_PICKLE_DTM)
assert len(doc_labels) == dtm.shape[0]
assert len(vocab) == dtm.shape[1]
print('loaded DTM with %d documents, %d vocab size, %d tokens'
      % (len(doc_labels), len(vocab), dtm.sum()))

#%% compute model
print('generating model with parameters:')
pprint(LDA_PARAMS)

model = LDA(**LDA_PARAMS)
model.fit(dtm)

#%% output
print('saving model to `%s`' % LDA_MODEL_PICKLE)
pickle_data((doc_labels, vocab, dtm, model), LDA_MODEL_PICKLE)

print('saving results to `%s`' % LDA_MODEL_EXCEL_OUTPUT)
save_ldamodel_summary_to_excel(LDA_MODEL_EXCEL_OUTPUT,
                               model.topic_word_, model.doc_topic_,
                               doc_labels, vocab, dtm=dtm)
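# unpickle_file, pickle_data and save_ldamodel_summary_to_excel are not defined
# in this excerpt; assuming they come from tmtoolkit (which ships helpers with
# these names), the imports would plausibly be:
# from tmtoolkit.utils import unpickle_file, pickle_data
# from tmtoolkit.topicmod.model_io import save_ldamodel_summary_to_excel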
def fit_model(self, data, params):
    lda_instance = LDA(**params)
    lda_instance.fit(data)
    return lda_instance
def index():
    core = TermiteCore(request, response)
    lda = LDA(request)
    return core.GenerateResponse(lda.params)
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
from lda import LDA

data = datasets.load_iris()
X = data.data
y = data.target

lda = LDA(n_components=2)
lda.fit(X, y)
X_projected = lda.transform(X)

print("shape of X:", X.shape)
print("shape of transformed X:", X_projected.shape)

x1 = X_projected[:, 0]
x2 = X_projected[:, 1]

plt.scatter(x1, x2, c=y, edgecolors='none', alpha=0.8,
            cmap=plt.cm.get_cmap('viridis', 3))
plt.colorbar()
plt.show()
def TopicCooccurrence():
    core = TermiteCore(request, response)
    lda = LDA(request)
    topicCooccurrence = lda.GetTopicCooccurrence()
    return core.GenerateResponse(lda.params, {'TopicCooccurrence': topicCooccurrence})
def load_wakati_docs(filename):
    # Whitespace-tokenized ("wakati") text file, one document per line
    texts = []
    for line in open(filename, 'r'):
        texts.append(line.split(' '))
    return texts


def create_dict_and_corpus(texts, no_below=10, no_above=0.2):
    # Build the dictionary
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    dictionary.save_as_text('./docs.dic')
    # Build the corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('./corpus.mm', corpus)
    return dictionary, corpus


if __name__ == "__main__":
    import sys
    import time

    from lda import LDA

    texts = load_wakati_docs(sys.argv[1])
    dictionary, corpus = create_dict_and_corpus(texts)
    t0 = time.time()
    lda = LDA(corpus, dictionary, num_topic=10, iterations=1000)
    t1 = time.time()
    lda.show_topics()
    print("Elapsed time: {}".format(t1 - t0))
from lda import LDA

train_corpus = 'data/worldnews_train.csv'
test_corpus = 'data/worldnews_test.csv'

alpha = 0.01
beta = 0.01
topics = 5

model = LDA(topics, alpha, beta)
model.fit(train_corpus, n_iters=10000, burn=8000)
model.print_topics()

x = input('Press key to start evaluation')

model.predict(test_corpus, n_iters=1000, burn=300)
model.print_eval_results()