Example 1
from os import listdir, mkdir
from os.path import isdir

import cPickle
import numpy as np

import helper_funcs
import onlineldavb


def main(argv):

    doc_list = []

    argList = handleArgs(argv)
    #list the docs in pickledDocs folder
    p = "../data/pickledDocs/"
    l = listdir(p)
    fileList = [p + f for f in l]

    #for each pickled doclist, append all docs to master doclist
    for fi in fileList:
        with open(fi, 'rb') as d:
            docs = cPickle.load(d)
        for k, x in docs.iteritems():
            doc_list.append(x)
        print len(doc_list)
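        # (each pickle is assumed to hold a dict mapping ids to document
        #  text, which is why only the values land in doc_list)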

    #D is total number of docs to show to the model, K is number of topics
    goal_its = 80  #number of iterations to run LDA
    corp_size = len(doc_list)  #number of documents in the corpus
    D = corp_size * goal_its  #number of documents expected to see
    K = 10  #default topic value, if none given in parameters
    saveModel = False  #whether to save LDA model itself
    desc = ""  #for performing non-standard runs
    version = ""  #for having multiple models with same parameters
    hyper_param = ""  #for testing hyperparameters

    #define the vocabulary file we will be using
    vocab = helper_funcs.read_dict("../data/dictionary.txt")  #default dict

    #initialize an instance of the OnlineLDA algorithm
    #parameters - vocabulary, num topics K, expected corpus size D, alpha, beta, tau0, kappa
    #if the path to an OnlineLDA pickle is passed, it re-opens that pickle

    K = int(argList[0])
    vocab = open(argList[1]).read().split()
    # handleArgs is assumed to return string (or None) values, so the
    # optional hyperparameters are converted to floats here
    if argList[2] is not None:
        alpha = float(argList[2])
    else:
        alpha = 0.1
    if argList[3] is not None:
        beta = float(argList[3])
    else:
        beta = 1.

    saveModel = False
    lda = onlineldavb.OnlineLDA(vocab, K, D, alpha, beta, 1024, 0.)
    print "created LDA with parameters:\nnumwords: " + str(
        len(vocab)) + "\n#topics: " + str(K) + "\nalpha: " + str(
            alpha) + "\nbeta: " + str(beta)

    paramTitle = hyper_param + str(
        len(vocab) / 1000) + "kwords_" + str(K) + "topics"

    folder = "../data/out/models/" + paramTitle
    if not isdir(folder):
        mkdir(folder)

    W = len(vocab)

    print "dictionary size: " + str(W)
    print paramTitle

    print folder
    #if desc.find("label") > -1:
    #    with open("../data/out/past_models/"+paramTitle+"/dictionary.txt",'wb') as f:
    #        voc = sorted(vocab.items(),key=operator.itemgetter(1))
    #        for x in voc:
    #            f.write(x[0]+"\n")
    #perform LDA on the document list for goal_its iterations, updating lambda
    for i in range(lda._updatect, goal_its):
        print "iteration %d of %d" % (i + 1, goal_its)
        (gamma, bound) = lda.update_lambda(doc_list)

        (wordids, wordcts) = onlineldavb.parse_doc_list(doc_list, lda._vocab)
        perwordbound = bound * len(doc_list) / (D * sum(map(sum, wordcts)))
        print np.exp(-perwordbound)  #held-out perplexity estimate

        #pickle the model and its output on the final iteration
        if (i + 1) == goal_its:
            if not isdir(folder):
                mkdir(folder)
            with open(folder + "/gamma.pickle", 'wb') as f:
                cp2 = cPickle.Pickler(f)
                cp2.dump(gamma)
            with open(folder + "/lambda.pickle", 'wb') as f:
                cp = cPickle.Pickler(f)
                cp.dump(lda._lambda)
            np.savetxt(folder + '/lambda.dat', lda._lambda)
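            # lambda.dat holds the K x W topic-word weight matrix as plain text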

            if saveModel:
                with open(folder + "/LDA.pickle", 'wb') as f:
                    cp3 = cPickle.Pickler(f)
                    cp3.dump(lda)
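
The example above relies on a handleArgs helper that is not shown. A minimal sketch of what it might look like, assuming main receives sys.argv and that handleArgs simply maps the positional arguments to [K, vocab path, alpha, beta], padding missing slots with None (the name and behavior are inferred from how argList is indexed above):

def handleArgs(argv):
    # argv[0] is the script name; the next four slots are expected to be
    # #topics, vocabulary file, alpha (optional), beta (optional)
    args = list(argv[1:5])
    while len(args) < 4:
        args.append(None)
    return args

With that shape, argList[0] feeds K, argList[1] names the vocabulary file, and argList[2]/argList[3] carry the optional alpha and beta overrides.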
Example 2
        # (this excerpt begins inside a loop that opens each document file as d)
        doc_list.append(d.read())

#D is total number of docs to show to the model, K is number of topics
goal_its = 40  #number of iterations to run the LDA process
corp_size = len(doc_list)  #number of documents in the corpus
D = corp_size  #number of documents expected to see
K = 70  #default topic value, if none given in parameters
saveModel = False  #whether to save the LDA model itself (lambda)
desc = ""  #for performing non-standard runs
version = ""  #for having multiple models with same parameters
hyper_param = ""  #for testing hyperparameters

#define the vocabulary file we will be using
vocab = helper_funcs.read_dict("../data/dictionary.txt")  #default dict

#initialize an instance of the OnlineLDA algorithm
#parameters - vocabulary, num topics K, expected corpus size D, alpha, eta, tau0, kappa
#if the path to an OnlineLDA pickle is passed, it re-opens that pickle
if len(sys.argv) > 2:
    K = int(sys.argv[1])
    vocab = open(sys.argv[2]).read().split()
    alpha = 0.1
    eta = 1.
    if len(sys.argv) == 4:
        folder = sys.argv[3]
    saveModel = False
    lda = onlineldavb.OnlineLDA(vocab, K, D, alpha, eta, 1024, 0.)
    print "created LDA with parameters:\n#topics: %d\nalpha: %s\neta: %s" % (K, alpha, eta)