def __init__(self, batchsize, d, k, tau, kappa):
    self.__dp = dataParse.dataParse(os.path.abspath("./data/ideas.txt"))
    self.__result = self.__dp.concatedField(os.path.abspath("./data/fieldList.txt"))
    # the parsed documents
    self.__doc = self.__result[0]
    # the field data associated with each document
    self.__fid = self.__result[1]
    # the vocabulary, one term per line
    self.__vocab = open(os.path.abspath('./data/vocabulary.txt')).readlines()
    # the number of words in the vocabulary
    self.__W = len(self.__vocab)
    # the number of documents to analyze in each iteration
    self.__batchsize = batchsize
    # the total number of documents
    self.__D = d
    # the number of topics
    self.__K = k
    # the number of iterations (one mini-batch per iteration)
    self.__documentstoanalyze = self.__D / self.__batchsize
    # learning-rate parameters tau_0 and kappa
    self.__tau = tau
    self.__kappa = kappa
    # online LDA instance (alpha=1/K, eta=1/K, tau_0=tau, kappa=kappa)
    self.__ldaObj = onlineldavb.OnlineLDA(self.__vocab, self.__K, self.__D,
                                          1. / self.__K, 1. / self.__K,
                                          self.__tau * 1.0, self.__kappa)
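
For orientation, a hedged sketch of the mini-batch loop these fields are set up for; the helper name run_batches is an assumption, and the only API it relies on is OnlineLDA.update_lambda, which the next example also calls.

# Hedged sketch, not part of the original class: feed the parsed documents
# to the stored OnlineLDA object one mini-batch per iteration.
def run_batches(lda_obj, docs, batchsize):
    iterations = len(docs) // batchsize   # mirrors __documentstoanalyze above
    for i in range(iterations):
        batch = docs[i * batchsize:(i + 1) * batchsize]
        # one online variational Bayes step; updates lda_obj's lambda in place
        gamma, bound = lda_obj.update_lambda(batch)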
Example 2
import sys
import numpy

import dataParse
import onlineldavb


def main():
    """
    Analyzes the "description" field of the ideas corpus using
    online VB for LDA.
    """
    dp = dataParse.dataParse("../../data/ideas.txt")
    des = dp.fieldParse("description")

    # The number of documents to analyze each iteration
    batchsize = 10
    # The total number of documents in the corpus
    D = 5000
    # The number of topics
    K = 100

    # How many documents to look at
    if len(sys.argv) < 2:
        documentstoanalyze = int(D / batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary (one term per line)
    vocab = open('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, D):
        # Retrieve texts (here the same description set every iteration)
        docset = des
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print('%d:  rho_t = %f,  held-out perplexity estimate = %f' %
              (iteration, olda._rhot, numpy.exp(-perwordbound)))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if iteration % 10 == 0:
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
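
A short, hedged follow-up sketch: each row of the lambda-*.dat files written above is the (unnormalized) word distribution of one topic, so the top words per topic can be read back with plain numpy. The helper name, the chosen file name, and the number of words shown are illustrative assumptions.

import numpy

def print_top_words(lambda_file, vocab_file, n_words=10):
    # each row of lambda corresponds to one topic over the vocabulary
    lam = numpy.loadtxt(lambda_file)
    vocab = [w.strip() for w in open(vocab_file).readlines()]
    for k in range(lam.shape[0]):
        top = numpy.argsort(lam[k])[::-1][:n_words]
        print('topic %d: %s' % (k, ' '.join(vocab[i] for i in top)))

# e.g. print_top_words('lambda-10.dat', './dictnostops.txt')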
Example 3
def __init__(self, docNum):
    self.__dp = dataParse.dataParse(os.path.abspath("./data/ideas.txt"))
    # index of the document this instance refers to
    self.__docNum = docNum
    # the "tags" field parsed from the ideas corpus
    self.__tags = self.__dp.fieldParse("tags")
Example 4
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import socket
import select
import Queue
import time
import os

import glVariable
import dataParse

# module-level dataParse instance (note: this rebinds the imported module name)
dataParse = dataParse.dataParse()

# non-blocking TCP server, to be driven by select()
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setblocking(False)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server_address = ("0.0.0.0", 14567)
server.bind(server_address)
server.listen(10)

# sockets to watch for readability / writability
inputs = [server]
outputs = []
#message_queues = {}

print("beginning")
print(glVariable.seqNum)
glVariable.seqNum += 1
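
The select() event loop that consumes the inputs and outputs lists is not part of this excerpt; below is a hedged sketch of the usual pattern they feed into. How received data is handed to the dataParse instance is an assumption, since that handling code is not shown.

# Hedged sketch of a standard select() loop for the non-blocking server above.
while inputs:
    readable, writable, exceptional = select.select(inputs, outputs, inputs)
    for s in readable:
        if s is server:
            # accept a new client and watch it for incoming data
            conn, addr = s.accept()
            conn.setblocking(False)
            inputs.append(conn)
        else:
            data = s.recv(4096)
            if data:
                pass  # e.g. hand `data` to the dataParse instance (assumed)
            else:
                # client closed the connection
                inputs.remove(s)
                s.close()
    for s in exceptional:
        inputs.remove(s)
        s.close()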