import numpy as np
from numpy import matlib
import time
import pprint
import sklearn

# Wall-clock timestamp taken when the script starts.
t0 = time.time()

# Model settings.  k is the column count of both matrices built below, i.e.
# the number of topics.  alpha and beta are presumably the smoothing
# hyperparameters of the topic model -- TODO confirm against the sampler.
k = 100
alpha = .1
beta = .01

# Input files: the document sample and its vocabulary list.
sampleFile = 'ap.txt'
# sampleFile = 'ap2.text'  # alternate sample file
sample_vocab = 'vocab.txt'

# NOTE(review): Corpus is neither defined nor imported in the visible code;
# it has to be provided elsewhere before this script runs.
corpus = Corpus()
corpus.num_docs = corpus.getNumOfSampleDocs(sampleFile)
corpus.vocab, corpus.num_terms = corpus.getSampleVocab(sample_vocab)
corpus.readDocsSample(sampleFile)

# Document-topic matrix: one row per document, one column per topic.
# dtype=float selects float64, the type the old 'float_' alias referred to;
# that alias was removed in NumPy 2.0.
DTMatrix = matlib.zeros((corpus.num_docs, k), dtype=float)

# Term-topic matrix: one row per vocabulary term, one column per topic.
TTMatrix = matlib.zeros((corpus.num_terms, k), dtype=float)

# word -> position in corpus.vocab, with trailing whitespace stripped from
# each vocabulary entry.
vocab_index_dict = {
    word.rstrip(): index for index, word in enumerate(corpus.vocab)
}

# Inverse lookup: position -> word.  .items() is used instead of the
# Python-2-only .iteritems() so the script runs on Python 2 and 3 alike.
vocab_word_dict = {index: word for word, index in vocab_index_dict.items()}

DocVocab = {}

# doc is an int
for doc_num, words in corpus.docs.items():
    # NOTE(review): the body of this loop lies outside the visible part of
    # the file.  `pass` is only a placeholder that keeps the header
    # syntactically complete; the real per-document work must follow here.
    pass
from numpy import matlib
import time
import pprint
import sklearn

# Wall-clock timestamp taken when the script starts.
t0 = time.time()

# Model settings.  k is the column count of both matrices built below, i.e.
# the number of topics.  alpha and beta are presumably the smoothing
# hyperparameters of the topic model -- TODO confirm against the sampler.
k = 100
alpha = .1
beta = .01

# Input files: the document sample and its vocabulary list.
sampleFile = 'ap.txt'
# sampleFile = 'ap2.text'  # alternate sample file
sample_vocab = 'vocab.txt'

# NOTE(review): Corpus is neither defined nor imported in the visible code;
# it has to be provided elsewhere before this script runs.
corpus = Corpus()
corpus.num_docs = corpus.getNumOfSampleDocs(sampleFile)
corpus.vocab, corpus.num_terms = corpus.getSampleVocab(sample_vocab)
corpus.readDocsSample(sampleFile)

# Document-topic matrix: one row per document, one column per topic.
# dtype=float selects float64, the type the old 'float_' alias referred to;
# that alias was removed in NumPy 2.0.
DTMatrix = matlib.zeros((corpus.num_docs, k), dtype=float)

# Term-topic matrix: one row per vocabulary term, one column per topic.
TTMatrix = matlib.zeros((corpus.num_terms, k), dtype=float)

# word -> position in corpus.vocab, with trailing whitespace stripped from
# each vocabulary entry.
vocab_index_dict = {
    word.rstrip(): index for index, word in enumerate(corpus.vocab)
}

# Inverse lookup: position -> word.  .items() is used instead of the
# Python-2-only .iteritems() so the script runs on Python 2 and 3 alike.
vocab_word_dict = {index: word for word, index in vocab_index_dict.items()}

DocVocab = {}

# doc is an int
for doc_num, words in corpus.docs.items():
    # sample topic index for word
    # each word in a document gets assigned a topic
    # words = set(vocab_index_dict.keys()) & set(words)
    # NOTE(review): the rest of this loop body lies outside the visible part
    # of the file.  `pass` is only a placeholder that keeps the header
    # syntactically complete; the real per-document work must follow here.
    pass