def main():
    n_samples = 2000
    n_features = 1000
    n_topics = 20
    n_top_words = 15

    dataset = fetch_20newsgroups(shuffle=True,
                                 random_state=1,
                                 remove=('headers', 'footers', 'quotes'))

    vectorizer = CountVectorizer(token_pattern=r"(?u)\b[^\d\W]\w+\b",
                                 max_df=0.9,
                                 max_features=n_features,
                                 min_df=2,
                                 stop_words='english')

    doc_word_count = vectorizer.fit_transform(dataset.data[:n_samples])
    lda = LDA(n_topics=n_topics,
              kappa=0.7,
              tau0=1024.,
              n_jobs=4,
              random_state=0)

    feature_names = vectorizer.get_feature_names()
    start_time = time.perf_counter()
    lda.fit(doc_word_count)
    end_time = time.perf_counter()
    # print feature_names[:10]
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join(
            [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

    print('run time = %.3f seconds' % (end_time - start_time))
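The kappa and tau0 arguments are the learning-rate decay and offset of online variational Bayes (Hoffman et al.), which scikit-learn exposes as learning_decay and learning_offset. A rough equivalent using scikit-learn's built-in estimator, reusing the doc_word_count matrix from above (a sketch, not the custom LDA class this example imports):

from sklearn.decomposition import LatentDirichletAllocation

sk_lda = LatentDirichletAllocation(n_components=20,           # number of topics
                                   learning_method='online',  # online variational Bayes
                                   learning_decay=0.7,        # kappa
                                   learning_offset=1024.,     # tau0
                                   n_jobs=4,
                                   random_state=0)
sk_lda.fit(doc_word_count)
# sk_lda.components_ plays the role of lda.components_ above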
Example #2
def main():
    n_samples = 2000
    n_features = 1000
    n_topics = 20
    n_top_words = 15

    dataset = fetch_20newsgroups(
        shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

    vectorizer = CountVectorizer(token_pattern=r"(?u)\b[^\d\W]\w+\b",
                                 max_df=0.9, max_features=n_features, min_df=2, stop_words='english')

    doc_word_count = vectorizer.fit_transform(dataset.data[:n_samples])
    lda = LDA(n_topics=n_topics, kappa=0.7, tau0=1024., n_jobs=4, random_state=0)
    
    feature_names = vectorizer.get_feature_names()
    start_time = time.perf_counter()
    lda.fit(doc_word_count)
    end_time = time.perf_counter()
    # print feature_names[:10]
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

    print('run time = %.3f seconds' % (end_time - start_time))
Example #3
def fit_reuters():
    corpus = Corpus()
    corpus.load_ldac(menu_path + 'reuters.ldac')
    model = LDA(n_topic=20)
    model.fit(corpus, n_iter=50)

    model.save_model(protocol=2)
Example #4
def main():
    corpus = Corpus()
    corpus.load_ldac(menu_path + 'reuters.ldac')
    model = LDA(n_topic=20)
    model.fit(corpus, valid_split=0.1, n_iter=10)

    perplexity = model.perplexity(corpus.docs)
    print(perplexity)
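For reference, held-out perplexity is the exponential of the negative log-likelihood per token. A minimal numpy sketch of that relation (a hypothetical helper, not part of this model's API):

import numpy as np

def perplexity_from_loglik(doc_loglik, doc_lengths):
    # perplexity = exp(-(total log-likelihood) / (total token count))
    return np.exp(-np.sum(doc_loglik) / np.sum(doc_lengths))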
Example #5
class TopicModelingLDA(object):
	# Wrapper around the LDA library.
	# Characterizes the topics using several scores found in the literature.

	def __init__(self,corpus,metrics_criteria='simple'):
		super(TopicModelingLDA, self).__init__()
		self.corpus = corpus
		self.select_metric_criteria(metrics_criteria)
		self.model = None
		self.topic_words = None
		self.top_words = None
		self.all_words = []

	def fit(self,num_topic=5,n_iter=1500):
		count_vect = CountVectorizer()
		x_train_counts = count_vect.fit_transform(self.corpus)
		self.model = LDA(n_topics=num_topic, n_iter=n_iter, random_state=1)
		self.model.fit(x_train_counts)

		self.topic_words = self.model.topic_word_
		self.vocabulary = count_vect.get_feature_names()

	def select_metric_criteria(self,metrics_criteria):
		if metrics_criteria == 'term_score':
			self.metrics = TopicTermScore()
		else:
			self.metrics = TopicSimpleScore()

	def get_highest_scores(self,k_top=10):
		# topic_words is a matrix of shape (number of topics, number of words);
		# row k holds the word distribution of topic k.
		num_topics = len(self.topic_words)
		print("Number of topics", num_topics)
		top_words = []
		self.top_words = {}

		for topic_k in range(num_topics):
			scores = []
			for v,word in enumerate(self.vocabulary):
				score = self.metrics.calculate(self.topic_words,topic_k,v)
				scores.append((word,score))
			scores.sort(key=lambda tup: tup[1]) 
			scores = scores[-k_top:]
			
			print ("Topico %d"%(topic_k))
			for word,score in scores:
				print ("%s,%.4f"%(word,score))
			print ("")

			self.top_words[topic_k] = [{'word':word,'score':score} for word,score in scores]
			self.all_words += [ word for word,score in scores]

		return self.top_words

	def get_all_words(self):
		return self.all_words
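The metric objects used above only need a calculate(topic_words, topic_k, v) method. A hypothetical sketch of the two scores, the 'simple' score being the word's probability within the topic and the term score of Blei and Lafferty, which down-weights words that are probable in every topic (class names mirror the ones referenced above; the bodies are assumptions):

import numpy as np

class TopicSimpleScore(object):
    def calculate(self, topic_words, topic_k, v):
        # score = p(word v | topic k)
        return topic_words[topic_k, v]

class TopicTermScore(object):
    def calculate(self, topic_words, topic_k, v):
        # term score: p(v|k) * log(p(v|k) / geometric mean over topics of p(v|k'))
        p = topic_words[topic_k, v]
        log_geo_mean = np.mean(np.log(topic_words[:, v] + 1e-12))
        return p * (np.log(p + 1e-12) - log_geo_mean)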
Example #6
    def generate_topics(self):
        file_to_tokens = self._get_normalized_corpus(self.files)

        np_matrix = self._get_document_term_matrix(file_to_tokens)
        model = LDA(n_topics=self.n_topics,
                    n_iter=self.n_iter,
                    random_state=self.random_state)
        model.fit(np_matrix)

        self._lda_model = model
Example #7
def _getLDA(text, label, n_topic_words):
    vectorizer = CountVectorizer(min_df=100, max_df=5000)
    transformer = TfidfTransformer()
    df = vectorizer.fit_transform(text)
    tfidf_word_name = vectorizer.get_feature_names()

    model = LDA(n_topics=20, n_iter=1000, random_state=1)
    model.fit(df)
    Dump(model, 'LDA_model', 'joblib')
    topic_word = model.topic_word_
    doc_topic = model.doc_topic_
    with open('topic_word.txt', 'w') as f:
        n_top_words = 300
        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(tfidf_word_name)[np.argsort(
                topic_dist)][:-(n_top_words + 1):-1]
            f.write('Topic {}: {}'.format(i, ' '.join(topic_words)) + '\n')
    return topic_word, doc_topic
Example #8
def exampleLDAExecution():
    X = data.load_reuters()
    vocab = data.load_reuters_vocab()
    titles = data.load_reuters_titles()

    # document-term matrix
    X = data.load_reuters()
    print("type(X): {}".format(type(X)))
    print("shape: {}\n".format(X.shape))

    # the vocab
    vocab = data.load_reuters_vocab()
    print("type(vocab): {}".format(type(vocab)))
    print("len(vocab): {}\n".format(len(vocab)))

    # titles for each story
    titles = data.load_reuters_titles()
    print("type(titles): {}".format(type(titles)))
    print("len(titles): {}\n".format(len(titles)))

    doc_id = 0
    word_id = 3117

    print("doc id: {} word id: {}".format(doc_id, word_id))
    print("-- count: {}".format(X[doc_id, word_id]))
    print("-- word : {}".format(vocab[word_id]))
    print("-- doc  : {}".format(titles[doc_id]))

    model = LDA(n_topics=20, n_iter=500, random_state=1)
    model.fit(X)  # model.fit_transform(X) is also available

    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 10
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    doc_topic = model.doc_topic_
    for i in range(10):
        print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))
Example #9
def RunLDA(FileLocation, NumDocs, NumTopics):
    # In order to create a Term Document matrix,
    # We read in every file and then make a list containing the body of 
    # all of the articles
    fin=open(FileLocation,'r')
    #Will need to store the urls when we make the tdm
    UrlArray = []
    #Create TDM object. It will also remove stopwords
    TDM = TermDocumentMatrix(simple_tokenize_remove_stopwords)
    # Add each article to the TDM object. Also create a list of urls
    # This is a massive corpus, so we only read the first NumDocs articles.
    for i in range(NumDocs):
        Article = next(fin)
        UrlArray.append(re.split(r'\t',Article)[0])
        TDM.add_doc(re.split(r'\t',Article)[1])
    # Rows in TDM is an iterable 
    # We can't have that to input it into numpy
    X = list(TDM.rows())
    # Oddly enough, the first row of the .rows() iterable in TDM returns a
    # list of all of the words used. Think of it as a header row.
    Vocab = X[0]
    Y = []
    #creating a 2d list containing the rows of the document matrix
    for i in range(len(X)-1):
        Y.append(X[i+1])
    # Create the LDA model object with the requested number of topics.
    model = LDA(n_topics=NumTopics, n_iter=1500, random_state=1)
    # Make a numpy Array to use as input
    Yarray = np.asarray(Y)
    # Fit the model. This process is similar to scikit-learn's algorithms.
    model.fit(Yarray)
    TopicWords = []
    topic_word = model.topic_word_
    n_top_words = 50
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(Vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        TopicWords.append(topic_words)
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))    
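The TermDocumentMatrix bookkeeping above (header row, list-of-rows conversion) can also be done with scikit-learn's CountVectorizer, which yields the document-term matrix and vocabulary directly; a sketch assuming the same tab-separated url<TAB>body file layout:

from sklearn.feature_extraction.text import CountVectorizer

def BuildMatrix(FileLocation, NumDocs):
    urls, bodies = [], []
    with open(FileLocation, 'r') as fin:
        for _ in range(NumDocs):
            url, body = next(fin).split('\t', 1)
            urls.append(url)
            bodies.append(body)
    vectorizer = CountVectorizer(stop_words='english')
    counts = vectorizer.fit_transform(bodies)       # sparse document-term matrix
    vocab = vectorizer.get_feature_names_out()      # get_feature_names() on older scikit-learn
    return urls, counts, vocab                      # counts (or counts.toarray()) feeds LDA(...).fit()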
Example #10
def appDescriptionsLDA():
    X = data.load_reuters()
    vocab = data.load_reuters_vocab()
    titles = data.load_reuters_titles()

    print(X)
    print(vocab)
    print(titles)

    print(X.shape)
    print(X.sum())
    model = LDA(n_topics=20, n_iter=500, random_state=1)
    model.fit(X)  # model.fit_transform(X) is also available

    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 10
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    doc_topic = model.doc_topic_
    for i in range(10):
        print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))
Example #11
from data import Data
from lda import LDA

data = Data()

data.load()
data.textPre('r')
tf = data.saveModel('r')

model = LDA()
model.fit(tf)
#model.print_top_words(data.tf_vectorizer.get_feature_names())

Example #12
    def fit_model(self, data, params):
        lda_instance = LDA(**params)
        lda_instance.fit(data)

        return lda_instance
Example #13
import matplotlib.pyplot as plt
import numpy as np
from lda import LDA
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

if __name__ == '__main__':
    x, y = fetch_openml('mnist_784', version=1, return_X_y=True)
    y = y.astype(int)
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    lda = LDA()

    lda.fit(x_train, y_train)
    train_acc = (lda.predict(x_train).argmax(1)
                 == y_train.squeeze()).mean() * 100
    test_acc = (lda.predict(x_test).argmax(1) == y_test.squeeze()).mean() * 100

    print(f'Train accuracy : {train_acc}%')
    print(f'Test accuracy : {test_acc}%')

    # plot generated
    images = []

    for i in range(10):  # each class
        temp = []
        for j in range(10):  # 10 samples
            temp.append(lda.generate(i).reshape(28, 28))
        images.append(temp)

    images = np.array(images)
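The predict/generate interface used here is not the lda topic-model package; it points to a Gaussian class-conditional (discriminant analysis) classifier from a local module. A minimal sketch of such a class, assuming a shared covariance matrix, a predict() that returns per-class scores (hence the argmax(1) above), and a generate() that samples from the fitted class-conditional Gaussian; names are chosen to match the calls above but the implementation is assumed:

import numpy as np

class GaussianLDA(object):
    """Linear discriminant analysis with a shared (pooled) covariance matrix."""

    def fit(self, X, y):
        X, y = np.asarray(X, dtype=float), np.asarray(y)
        self.classes_ = np.unique(y)
        self.means_ = np.array([X[y == c].mean(axis=0) for c in self.classes_])
        self.priors_ = np.array([np.mean(y == c) for c in self.classes_])
        # pooled covariance, lightly regularized so it stays invertible
        diff = X - self.means_[np.searchsorted(self.classes_, y)]
        self.cov_ = diff.T @ diff / len(X) + 1e-3 * np.eye(X.shape[1])
        self._prec = np.linalg.inv(self.cov_)
        return self

    def predict(self, X):
        # linear discriminant scores, one column per class; caller takes argmax(1)
        X = np.asarray(X, dtype=float)
        scores = X @ self._prec @ self.means_.T
        scores -= 0.5 * np.sum(self.means_ @ self._prec * self.means_, axis=1)
        scores += np.log(self.priors_)
        return scores

    def generate(self, c):
        # draw one sample from the fitted Gaussian of class c
        return np.random.multivariate_normal(self.means_[c], self.cov_)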
Example #14
plt.scatter(x1,
            x2,
            c=y,
            edgecolor='none',
            alpha=0.8,
            cmap=plt.cm.get_cmap('viridis', 3))

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar()
plt.show()

# Display the data using two linear discriminants
from lda import LDA
lda = LDA(2)
lda.fit(X_min_max, y)
X_projected = lda.transform(X_min_max)

print('Min-max normalized X:', X_min_max.shape)  # (150, 4)
print('X after LDA:', X_projected.shape)  # (150, 2)

x1 = X_projected[:, 0]
x2 = X_projected[:, 1]

plt.scatter(x1,
            x2,
            c=y,
            edgecolor='none',
            alpha=0.8,
            cmap=plt.cm.get_cmap('viridis', 3))
Example #15
    y_train = eval("y_%s_train" % l)
    X_test = eval("X_%s_test" % l)
    y_test = eval("y_%s_test" % l)

    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 18))
    ax = ax.flatten()

    print(l)
    """
    Run LDA
    """
    LDA_clf = LDA()
    LDA_clf.fit(X_train, y_train)

    lda_train_error = np.mean(LDA_clf.predict(X_train).flatten() != y_train)
    lda_test_error = np.mean(LDA_clf.predict(X_test).flatten() != y_test)

    plot_decision_regions(X=X_combined,
                          y=y_combined,
                          classifier=LDA_clf,
                          test_idx=range(X_train.shape[0],
                                         X_train.shape[0] + X_test.shape[0]),
                          ax=ax[0])
    ax[0].set_xlabel("x1", fontsize="large")
    ax[0].set_ylabel("x2", fontsize="large")
    ax[0].legend(loc="upper right", fontsize="large")
    ax[0].set_title("Generative model (LDA) on dataset %s" % l,
                    fontsize="x-large",
Example #16
import pickle

from lda import LDA

from data.datafile import AADataFile
dfile = pickle.load(open("data/datafile.pkl", "rb"))

dt = dfile.DT
te = dfile.TE


lda = LDA(K=10, n_jobs=8, nr_em_epochs=20)

perp, b, g = lda.fit(dt)
Example #17
from lda import LDA

train_corpus = 'data/worldnews_train.csv'
test_corpus = 'data/worldnews_test.csv'

alpha = 0.01
beta = 0.01
topics = 5

model = LDA(topics, alpha, beta)
model.fit(train_corpus, n_iters=10000, burn=8000)

model.print_topics()

x = input('Press key to start evaluation')

model.predict(test_corpus, n_iters=1000, burn=300)

model.print_eval_results()
Example #18
from lda import LDA, LdaType
import pandas as pd

data = pd.read_csv("./data/fisher.csv")

lda = LDA()
lda.fit(data=data, target_column_name='target')
conversion_data = lda.conversion(LdaType.Two)
print(conversion_data)
Example #19
# In[ ]:

#scikit-learn LDA implementation
#201
#1121
#4617
#model=LatentDirichletAllocation(n_topics=num_topics,max_iter=100,learning_method='batch',random_state=201)#,doc_topic_prior=50.0/num_topics,topic_word_prior=200.0/num_topics)
#model.fit(bag_of_words)

# In[ ]:

#lda implementation from https://github.com/ariddell/lda using collapsed gibbs sampling
model = LDA(n_topics=num_topics, n_iter=1000, random_state=201, refresh=100)
model.fit(bag_of_words)  # model.fit_transform(X) is also available
#topic_word = model.topic_word_  # model.components_ also works

# In[ ]:


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join(
            [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))


# In[ ]:

feature_names = vectorizer.get_feature_names()
Example #20
    elif num_args == 3:
        params = {'idx_dataset': sys.argv[1], 'train': sys.argv[2] == 'True'}
    elif num_args == 4:
        params = {
            'idx_dataset': sys.argv[1],
            'train': sys.argv[2] == 'True',
            'test': sys.argv[3] == 'True'
        }

    # load dataset
    X_train, y_train, X_test, y_test = read_data(**params)

    if X_train is not None:
        # Linear Discriminant Analysis (LDA)
        lda = LDA()
        lda.fit(X_train, y_train)
        plot_results(lda, params['idx_dataset'], X_train, y_train, X_test,
                     y_test)
        print(
            'The accuracy on train (test) dataset {} for LDA: {} ({})'.format(
                params['idx_dataset'], lda.score(X_train, y_train),
                lda.score(X_test, y_test)))

        # Logistic regression
        log_reg = LogisticRegression()
        log_reg.fit(X_train, y_train)
        plot_results(log_reg, params['idx_dataset'], X_train, y_train, X_test,
                     y_test)
        print('The accuracy on train (test) dataset {} for LogReg: {} ({})'.
              format(params['idx_dataset'], log_reg.score(X_train, y_train),
                     log_reg.score(X_test, y_test)))
Example #21
    utt2class = {}
    for l in lines:
        utt2class[l.split()[0]] = l.split()[1]
    labelled_egs = set(utt2class.keys())

    mean = np.load(args.mean)

    print('INFO::  Data loaded. Normalizing...')

    reader = script_reader(args.scp)
    X = []
    Y = []
    for i, data in enumerate(reader):
        utt, xvec = data
        if utt not in utt2class.keys():
            continue
        X.append(xvec - mean)
        Y.append(label2idx[utt2class[utt]])

    print('INFO::  Normalized ' + str(len(Y)) + ' utterances. Now computing LDA...')

    from lda import LDA

    lda = LDA()
    lda.fit(np.array(X), np.array(Y))

    print('INFO::  LDA done. Saving Model...')

    with open(args.out + '/lda.pkl', 'wb') as output:
        pickle.dump(lda, output, pickle.HIGHEST_PROTOCOL)
Example #22
    def fit_model(self, data, params):
        from lda import LDA
        lda_instance = LDA(**params)
        lda_instance.fit(data)

        return lda_instance
Example #23
from lda import LDA, _doc_update, _slice_doc_update
import pickle
import numpy as np

np.seterr(divide="raise")

from data.datafile import AADataFile
dfile = pickle.load(open("data/datafile.pkl", "rb"))

dt = dfile.DT
te = dfile.TE

f = te.toarray().argmax(axis=1)

lda = LDA(K=10, n_jobs=8, nr_em_epochs=20)

perp, b, g, e = lda.fit(dt, f)
Example #24
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np

from lda import LDA
data = datasets.load_iris()
X = data.data
y = data.target

lda = LDA(n_components=2)
lda.fit(X, y)

X_projected = lda.transform(X)
print("shape of X:", X.shape)
print("shape of transform X ", X_projected.shape)

x1 = X_projected[:, 0]
x2 = X_projected[:, 1]

plt.scatter(x1,
            x2,
            c=y,
            edgecolors='none',
            alpha=0.8,
            cmap=plt.cm.get_cmap('viridis', 3))

plt.colorbar()
plt.show()
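The n_components/transform interface above matches the common from-scratch LDA that projects data onto the leading eigenvectors of S_W^{-1} S_B. A minimal sketch of such a transformer (an assumed implementation; the imported lda module may differ in detail):

import numpy as np

class FisherLDA(object):
    def __init__(self, n_components=2):
        self.n_components = n_components

    def fit(self, X, y):
        X, y = np.asarray(X, dtype=float), np.asarray(y)
        mean_all = X.mean(axis=0)
        n_features = X.shape[1]
        S_W = np.zeros((n_features, n_features))   # within-class scatter
        S_B = np.zeros((n_features, n_features))   # between-class scatter
        for c in np.unique(y):
            Xc = X[y == c]
            mean_c = Xc.mean(axis=0)
            S_W += (Xc - mean_c).T @ (Xc - mean_c)
            d = (mean_c - mean_all).reshape(-1, 1)
            S_B += len(Xc) * (d @ d.T)
        # eigenvectors of S_W^{-1} S_B, largest eigenvalues first
        eigvals, eigvecs = np.linalg.eig(np.linalg.pinv(S_W) @ S_B)
        order = np.argsort(eigvals.real)[::-1]
        self.components_ = eigvecs.real[:, order[:self.n_components]]
        return self

    def transform(self, X):
        return np.asarray(X, dtype=float) @ self.components_

# usage mirroring the example above:
# X_projected = FisherLDA(n_components=2).fit(X, y).transform(X)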
Example #25
    def clustering_measure(self, n_cluster):
        km = KMeans(n_cluster)
        km.fit(self.doc_features)
        print("Adjusted Rand-Index: %.3f"
              % metrics.adjusted_rand_score(self.doc_class, km.labels_))

    def cross_validation(self):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            self.doc_features, self.doc_class, test_size=0.4, random_state=0)
        clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
        print ("Cross-Validation Score: %.3f" % clf.score(X_test, y_test))


if __name__ == '__main__':
    # load dataset
    dataset = CNN()
    dataset.load_data('/home/yi/Dropbox/workspace/data/cnn/')

    # train lda
    lda = LDA(5)
    lda.initialize(dataset.data_matrix)
    #lda.load_label('labels.txt', dataset.dictionary)
    for iter in range(20):
        lda.fit(dataset.data_matrix)
    lda.fininsh()
    lda.print_top_words(dataset.dictionary, 10)

    # evaluate lda
    eval = Evaluator(dataset, lda)
    eval.clustering_measure(n_cluster=5)
    eval.cross_validation()
Example #26
print('input tokens from preprocessing pipeline %d' % toks)

print('loading DTM from `%s`...' % DATA_PICKLE_DTM)
doc_labels, vocab, dtm, tokens = unpickle_file(DATA_PICKLE_DTM)
assert len(doc_labels) == dtm.shape[0]
assert len(vocab) == dtm.shape[1]
print('loaded DTM with %d documents, %d vocab size, %d tokens' %
      (len(doc_labels), len(vocab), dtm.sum()))

#%% compute model

print('generating model with parameters:')
pprint(LDA_PARAMS)

model = LDA(**LDA_PARAMS)
model.fit(dtm)
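LDA_PARAMS is defined elsewhere in this script; with the lda package used here it would typically be a dict of constructor arguments, for example (an assumed configuration, shown only for illustration):

LDA_PARAMS = dict(n_topics=20, n_iter=1000, random_state=1, refresh=100)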

#%% output

print('saving model to `%s`' % LDA_MODEL_PICKLE)
pickle_data((doc_labels, vocab, dtm, model), LDA_MODEL_PICKLE)

print('saving results to `%s`' % LDA_MODEL_EXCEL_OUTPUT)
save_ldamodel_summary_to_excel(LDA_MODEL_EXCEL_OUTPUT,
                               model.topic_word_,
                               model.doc_topic_,
                               doc_labels,
                               vocab,
                               dtm=dtm)

#%%
Example #27
from lda import LDA
from dataset import TwentyNewsDataset
import time
dataset = TwentyNewsDataset()
dataset.load_data()
n_topics = 20

lda = LDA(n_topics)
lda.initialize(dataset.data_matrix)
lda.load_label('labels.txt', dataset.dictionary)
print(lda.print_labels())

for _ in range(100):
    lda.fit()

lda.get_topic_word()
lda.get_doc_topic()
lda.print_top_words(dataset.dictionary, 10)
Example #28
train_re_path = '../data/train/relevant.txt'
train_ir_path = '../data/train/irrelevant.txt'
test2_ir_path = '../data/test2/irrelevant.txt'
test2_re_path = '../data/test2/relevant.txt'
test1_ir_path = '../data/test1/irrelevant.txt'
test1_re_path = '../data/test1/relevant.txt'


words_dict, idx_dict = create_dict(full_path, stop_words)

train_X = load_data(train_path)
train_X = word_to_idx(train_X, words_dict)

lda = LDA(5)

lda.fit(train_X, words_dict.items())

test1_re_X = load_data(test1_re_path)
test1_re_X = word_to_idx(test1_re_X, words_dict)
test1_ir_X = load_data(test1_ir_path)
test1_ir_X = word_to_idx(test1_ir_X, words_dict)

test2_re_X = load_data(test2_re_path)
test2_re_X = word_to_idx(test2_re_X, words_dict)
test2_ir_X = load_data(test2_ir_path)
test2_ir_X = word_to_idx(test2_ir_X, words_dict)

target_X = load_data(target_path)
target_X = word_to_idx(target_X, words_dict)

train_re_X = load_data(train_re_path)
Example #29
    n = 10000  # number of samples
    data1 = Data2D(mu1, cov, n)
    data2 = Data2D(mu2, cov, n)
    X1 = data1.get_data()
    X2 = data2.get_data()
    X = np.vstack([X1, X2])

    # PCA
    pca = PCA()
    pca.fit(X)
    pca_vec = pca.get_vec()
    show_hist(project(X1, pca_vec), project(X2, pca_vec))

    # LDA
    lda = LDA()
    lda.fit(X1, X2)
    lda_vec = lda.get_vec()
    show_hist(project(X1, lda_vec), project(X2, lda_vec))

    # Draw the figure
    # Set the background to white
    plt.figure(facecolor="w")

    axis_x = np.linspace(-10, 10)
    pca_y = (pca_vec[1] / pca_vec[0]) * axis_x
    lda_y = (lda_vec[0] / lda_vec[1]) * axis_x
    plt.plot(axis_x, pca_y, "c-", label="PCA")
    plt.plot(axis_x, lda_y, "m-", label="LDA")

    # Plot the scatter plots
    plt.scatter(data1.x, data1.y, color='r', marker='x')