def __init__(self, batchsize, d, k, tau, kappa):
    self.__dp = dataParse.dataParse(os.path.abspath("./data/ideas.txt"))
    self.__result = self.__dp.concatedField(os.path.abspath("./data/fieldList.txt"))
    # documents
    self.__doc = self.__result[0]
    # field data
    self.__fid = self.__result[1]
    # vocabulary
    self.__vocab = open(os.path.abspath('./data/vocabulary.txt')).readlines()
    # the number of words in the vocabulary
    self.__W = len(self.__vocab)
    # the number of documents to analyze in each iteration
    self.__batchsize = batchsize
    # the total number of documents
    self.__D = d
    # the number of topics
    self.__K = k
    # the number of iterations (batches needed to cover the corpus)
    self.__documentstoanalyze = self.__D / self.__batchsize
    # learning-rate parameters for online variational Bayes
    self.__tau = tau
    self.__kappa = kappa
    # lda instance (alpha=1/K, eta=1/K, tau_0=tau, kappa=kappa)
    self.__ldaObj = onlineldavb.OnlineLDA(self.__vocab, self.__K, self.__D,
                                          1. / self.__K, 1. / self.__K,
                                          self.__tau * 1.0, self.__kappa)
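# For context: a minimal standalone sketch of the same OnlineLDA setup,
# mirroring the hyperparameters used in main() below. The vocabulary path
# and the concrete values for K, D, tau_0, and kappa are assumptions for
# illustration, not this class's actual configuration.
import onlineldavb

K = 100   # number of topics (assumed)
D = 5000  # total corpus size (assumed)
vocab = open('./data/vocabulary.txt').readlines()

# alpha = eta = 1/K, tau_0 = 1024, kappa = 0.7, as in main() below
olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)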
def main():
    """
    Analyzes the parsed idea descriptions using online VB for LDA.
    """
    dp = dataParse.dataParse("../../data/ideas.txt")
    des = dp.fieldParse("description")
    # The number of documents to analyze each iteration
    batchsize = 10
    # The total number of documents
    D = 5000
    # The number of topics
    K = 100
    # How many documents to look at
    if len(sys.argv) < 2:
        documentstoanalyze = int(D / batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])
    # Our vocabulary
    vocab = open('./dictnostops.txt').readlines()
    W = len(vocab)
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, D):
        # Use the full description set as this iteration's batch
        docset = des
        # Give it to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if iteration % 10 == 0:
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
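# A short companion sketch for inspecting the topics saved by main()
# above: load one lambda-<iteration>.dat file and print the
# highest-weight words per topic. The iteration number (90) and the
# ten-words-per-topic cutoff are assumptions for illustration.
import numpy

vocab = [w.strip() for w in open('./dictnostops.txt').readlines()]
lam = numpy.loadtxt('lambda-90.dat')        # shape: (K, W)
for k in range(lam.shape[0]):
    topic = lam[k, :] / lam[k, :].sum()     # normalize to a distribution
    top = numpy.argsort(-topic)[:10]        # indices of the 10 top words
    print 'topic %d: %s' % (k, ' '.join(vocab[i] for i in top))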
def __init__(self, docNum):
    self.__dp = dataParse.dataParse(os.path.abspath("./data/ideas.txt"))
    self.__docNum = docNum
    self.__tags = self.__dp.fieldParse("tags")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import socket
import select
import Queue
import time
import os

import glVariable
import dataParse

# Parser instance (note: this rebinds the module name dataParse)
dataParse = dataParse.dataParse()

# Non-blocking TCP server socket
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setblocking(False)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server_address = ("0.0.0.0", 14567)
server.bind(server_address)
server.listen(10)

# Socket lists for select(): sockets to watch for reads and writes
inputs = [server]
outputs = []
# message_queues = {}

print ("beginning")
print (glVariable.seqNum)
glVariable.seqNum += 1
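# The setup above prepares `inputs`, `outputs`, and a (commented-out)
# per-connection queue dict, but stops before the event loop. What
# follows is a minimal sketch of the select() loop such a setup
# typically feeds; the echo behaviour, 1024-byte reads, and 1-second
# timeout are assumptions, not the original code.
message_queues = {}
while inputs:
    readable, writable, exceptional = select.select(inputs, outputs, inputs, 1)
    for s in readable:
        if s is server:
            # New client: accept it and start tracking it
            conn, addr = s.accept()
            conn.setblocking(False)
            inputs.append(conn)
            message_queues[conn] = Queue.Queue()
        else:
            data = s.recv(1024)
            if data:
                # Queue the data to echo back; watch the socket for writability
                message_queues[s].put(data)
                if s not in outputs:
                    outputs.append(s)
            else:
                # Client closed the connection: stop tracking and clean up
                if s in outputs:
                    outputs.remove(s)
                inputs.remove(s)
                s.close()
                del message_queues[s]
    for s in writable:
        try:
            next_msg = message_queues[s].get_nowait()
        except Queue.Empty:
            # Nothing left to send: stop watching for writability
            outputs.remove(s)
        else:
            s.send(next_msg)
    for s in exceptional:
        # Error on the socket: drop it entirely
        inputs.remove(s)
        if s in outputs:
            outputs.remove(s)
        s.close()
        del message_queues[s]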