コード例 #1
0
ファイル: LDA.py プロジェクト: jsuit/hack-night-1-clusterer
import numpy as np
from numpy import matlib
import time
import pprint
import sklearn

# Wall-clock timestamp taken before any corpus loading starts.
t0 = time.time()
# Number of topics: the column count of both count matrices built below.
k = 100
# NOTE(review): presumably the Dirichlet priors for the document-topic (alpha)
# and term-topic (beta) distributions -- confirm against the sampling code.
alpha = .1
beta = .01
sampleFile = 'ap.txt'
# Alternative input file, kept for reference.
#sampleFile = 'ap2.text'
sample_vocab = 'vocab.txt'
corpus = Corpus()

corpus.num_docs = corpus.getNumOfSampleDocs(sampleFile)
corpus.vocab, corpus.num_terms = corpus.getSampleVocab(sample_vocab)
# NOTE(review): presumably populates corpus.docs, which the loop following
# this section iterates -- confirm in Corpus.
corpus.readDocsSample(sampleFile)
# Document-topic matrix: one row per document, one column per topic.
# dtype=float is the same 64-bit float type the legacy 'float_' alias named,
# without depending on that alias (np.float_ was removed in NumPy 2.0).
DTMatrix = matlib.zeros((corpus.num_docs, k), dtype=float)
# Term-topic matrix: one row per vocabulary term, one column per topic.
TTMatrix = matlib.zeros((corpus.num_terms, k), dtype=float)
# Map each vocabulary entry, with trailing whitespace stripped, to its
# position in corpus.vocab.
vocab_index_dict = {
    word.rstrip(): index
    for index, word in enumerate(corpus.vocab)
}
# Inverse lookup: position -> stripped word. items() is used instead of the
# Python-2-only iteritems() so this line runs on both Python 2 and Python 3.
vocab_word_dict = {index: word for word, index in vocab_index_dict.items()}
# Starts empty; nothing in this section fills it.
DocVocab = {}
#doc is an int

for doc_num, words in corpus.docs.iteritems():
コード例 #2
0
ファイル: LDA.py プロジェクト: jsuit/hack-night-1-clusterer
from numpy import matlib
import time
import pprint
import sklearn

# Wall-clock timestamp taken before any corpus loading starts.
t0 = time.time()
# Number of topics: the column count of both count matrices built below.
k = 100
# NOTE(review): presumably the Dirichlet priors for the document-topic (alpha)
# and term-topic (beta) distributions -- confirm against the sampling code.
alpha = .1
beta = .01
sampleFile = 'ap.txt'
# Alternative input file, kept for reference.
#sampleFile = 'ap2.text'
sample_vocab = 'vocab.txt'
corpus = Corpus()


corpus.num_docs = corpus.getNumOfSampleDocs(sampleFile)
corpus.vocab, corpus.num_terms = corpus.getSampleVocab(sample_vocab)
# NOTE(review): presumably populates corpus.docs, which the loop following
# this section iterates -- confirm in Corpus.
corpus.readDocsSample(sampleFile)
# Document-topic matrix: one row per document, one column per topic.
# dtype=float is the same 64-bit float type the legacy 'float_' alias named,
# without depending on that alias (np.float_ was removed in NumPy 2.0).
DTMatrix = matlib.zeros((corpus.num_docs, k), dtype=float)
# Term-topic matrix: one row per vocabulary term, one column per topic.
TTMatrix = matlib.zeros((corpus.num_terms, k), dtype=float)
# Map each vocabulary entry, with trailing whitespace stripped, to its
# position in corpus.vocab.
vocab_index_dict = {
    word.rstrip(): index
    for index, word in enumerate(corpus.vocab)
}
# Inverse lookup: position -> stripped word. items() is used instead of the
# Python-2-only iteritems() so this line runs on both Python 2 and Python 3.
vocab_word_dict = {index: word for word, index in vocab_index_dict.items()}
# Starts empty; nothing in this section fills it.
DocVocab = {}
#doc is an int

for doc_num,words in corpus.docs.iteritems():
        #sample topic index for word
        #each word in a document gets assigned a topic
        #words = set(vocab_index_dict.keys()) & set(words)