Example #1
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    batch VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 8
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100
    # The size of the window
    L = 30

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])
        if (len(sys.argv) >= 3):
            L = int(sys.argv[2])
        if (len(sys.argv) >= 4):
            batchsize = int(sys.argv[3])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=0.5, eta=0.5, rho=10^-2
    olda = batchldavb.batchLDA(vocab, K, D, 0.5, 0.5, 1e-2, -1, L)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to batch LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = batchldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
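The per-word bound arithmetic above can be packaged as a small helper. A minimal sketch (not part of the example), assuming bound is the corpus-level variational bound returned by update_lambda_docs and wordcts holds the per-document token counts from parse_doc_list:

import numpy

def perplexity_estimate(bound, n_docs, D, wordcts):
    # Rescale the corpus-level bound down to the n_docs documents in the batch,
    # then divide by the number of tokens in the batch to get a per-word bound.
    tokens = sum(map(sum, wordcts))
    perwordbound = bound * n_docs / (D * tokens)
    # Perplexity is the exponentiated negative per-word bound.
    return numpy.exp(-perwordbound)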
Example #2
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    batch VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 8
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100
    # The size of the window
    L = 30

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D / batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])
        if (len(sys.argv) >= 3):
            L = int(sys.argv[2])
        if (len(sys.argv) >= 4):
            batchsize = int(sys.argv[3])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=0.5, eta=0.5, rho=10^-2
    olda = batchldavb.batchLDA(vocab, K, D, 0.5, 0.5, 1e-2, -1, L)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to batch LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = batchldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
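A companion sketch for inspecting the lambda-*.dat files saved above (an addition, not in the original): it loads the topic-word matrix and prints the ten highest-weight vocabulary words per topic, assuming the same dictnostops.txt and a saved lambda-0.dat.

import numpy

vocab = [w.strip() for w in open('./dictnostops.txt').readlines()]
lam = numpy.loadtxt('lambda-0.dat')          # K x W matrix of topic weights

for k in range(lam.shape[0]):
    weights = lam[k] / lam[k].sum()          # normalize to a distribution over words
    top = numpy.argsort(weights)[::-1][:10]  # indices of the ten largest weights
    print('topic %d: %s' % (k, ' '.join(vocab[i] for i in top)))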
Example #3
def main():
    """
    Fits random Wikipedia articles to the two-hidden-layer model in an online fashion.
    """

    # The number of documents to analyze in each iteration
    batchsize = 32
    # The estimated total number of documents  
    D = 5.13e6
    # The number of topics
    K1 = 30
    K2 = 3
    eta0 = 1 / np.float(K1)
    eta1 = 1 / np.float(K2)
    eta2 = 1 / np.float(K2) 

    # The total number of iterations
    if (len(sys.argv) < 2):
        M = 100
    else:
        M = int(sys.argv[1])

 
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm.
    model = online_wiki_functions.Online_two_hidden_layers(vocab, K1, K2, D, eta0, eta1, eta2, 256, 0.6) 
    for iteration in range(0, M):
        # Download wikipedia articles randomly.
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Fit the articles to the deep LDA model and compute the held-out perplexity estimate.
        bound = model.update_lambda_docs(docset) 
        print '%d: held-out perplexity estimate = %f' % \
            (iteration, bound) 

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        #if (iteration % 10 == 0):
        #    numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
        #    numpy.savetxt('gamma-%d.dat' % iteration, gamma)
    return bound
Example #4
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """
    comm = MPI.COMM_WORLD
    size = comm.Get_size()   
    rank = comm.Get_rank()
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of documents to analyze each iteration
    batchsize = 10
    # The number of topics
    K = 100

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Each worker (slave) rank processes three mini-batches.
    for iteration in range(3):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration+1, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
    gammas = comm.gather(gamma, root = 0)
    lambdas = comm.gather(olda._lambda, root = 0)
    if rank == 0:
        gamma_result = numpy.vstack(gammas)
        lambda_result = numpy.vstack(lambdas)
        numpy.savetxt('lambda_parallel.dat', lambda_result)
        numpy.savetxt('gamma_parallel.dat', gamma_result)
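The gather-and-stack step at the end of the example can be exercised on its own. A minimal sketch with dummy arrays standing in for gamma and olda._lambda (run under mpiexec; assumes mpi4py, as the example does):

import numpy
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# Stand-ins for the per-rank LDA outputs.
local_gamma = numpy.random.gamma(100., 1. / 100., (10, 100))
local_lambda = numpy.random.gamma(100., 1. / 100., (100, 500))

gammas = comm.gather(local_gamma, root=0)    # list of arrays on rank 0, None elsewhere
lambdas = comm.gather(local_lambda, root=0)

if rank == 0:
    gamma_result = numpy.vstack(gammas)
    lambda_result = numpy.vstack(lambdas)
    numpy.savetxt('gamma_parallel.dat', gamma_result)
    numpy.savetxt('lambda_parallel.dat', lambda_result)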
Example #5
def main(num_batches, K):
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.

    Arguments:
    - num_batches: the number of batches to process; corpus_size = num_batches * batchsize
    - K : the number of topics, determined from stdin
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6
    
    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, num_batches):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
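This main takes num_batches and K as arguments rather than parsing sys.argv itself. A possible driver (an assumption; the original __main__ block is not shown), reading both values from the command line as the other examples do:

if __name__ == '__main__':
    import sys
    main(int(sys.argv[1]), int(sys.argv[2]))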
Example #6
def main(num_batches, K):
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.

    Arguments:
    - num_batches: the number of batches to process; corpus_size = num_batches * batchsize
    - K : the number of topics, determined from stdin
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, num_batches):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
Example #7
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """
    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100
    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)

    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d: rho_t = %f, held-out perplexity estimate = %f' % \
              (iteration, olda._rhot, numpy.exp(-perwordbound))
Example #8
def getArticles(nr=5):
    """ Downloads a batch of random Wikipedia articles. """
    (docs, names) = wikirandom.get_random_wikipedia_articles(nr)
    return (docs, names)
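Possible usage of the helper above (assumes the wikirandom module from the original code is importable):

(docs, names) = getArticles(5)
for title in names:
    print(title)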
Example #9
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)
    
    # Add terms and topics to the DB
    db.init()
    db.add_terms(vocab)
    db.add_topics(K)
    
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        
        # Arrays for adding batches of data to the DB
        doc_array = []
        doc_term_array = []
        
        for d in range(len(articlenames)):
            doc_array.append((articlenames[d], docset[d]))
        
        # Add a batch of docs to the DB; this is the one DB task that is not in
        # the separate DB write thread since later tasks depend on having doc ids.
        # Since writes take so long, this also balances the two threads time-wise.
        doc_ids = db.add_docs(doc_array)
	
        doc_topic_array = []
        for d in range(len(gamma)):
            doc_size = len(docset[d])
            for k in range(len(gamma[d])):
                doc_topic_array.append((doc_ids[d], k, gamma[d][k], gamma[d][k]/doc_size))
        db.add_doc_topics(doc_topic_array)

        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
            
            topic_terms_array = []
            for topic in range(len(olda._lambda)):
                lambda_sum = sum(olda._lambda[topic])
                
                for term in range(len(olda._lambda[topic])):
                    topic_terms_array.append((topic, term, olda._lambda[topic][term]/lambda_sum))
            db.update_topic_terms(K, topic_terms_array)
                
            gc.collect() # probably not necessary, but precautionary for long runs
            db.print_task_update()
        db.increment_batch_count()
    
    # The DB thread ends only when it has run out of tasks and has been
    # signaled that it will not be receiving any more tasks.
    db.signal_end()
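A small companion sketch (not in the original): the saved gamma-*.dat rows can be turned into per-document topic proportions by normalizing each row.

import numpy

gamma = numpy.loadtxt('gamma-0.dat')                 # documents x K
theta = gamma / gamma.sum(axis=1)[:, numpy.newaxis]  # each row now sums to 1
print(theta[0])                                      # topic mix of the first document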
Example #10
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100
    rho_t_vector = []
    perplexity_vector = []
    time_vector = []
    time1_vector = []

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D / batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7

    kappa = 0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., kappa)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    t1 = time.time()
    for iteration in tqdm(range(0, documentstoanalyze)):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        t = time.time()
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        t2 = time.time()
        time_vector.append(t2 - t1)
        if len(time1_vector) == 0:
            time1_vector.append(t2 - t)
        else:
            time1_vector.append(time1_vector[-1] + t2 - t)
        rho_t_vector.append(olda._rhot)
        perplexity_vector.append(perwordbound)

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)

        numpy.savetxt('time_%.1f_%d' % (kappa, batchsize),
                      numpy.array(time_vector))
        numpy.savetxt('rho_%.1f_%d' % (kappa, batchsize),
                      numpy.array(rho_t_vector))
        numpy.savetxt('perplexity_%.1f_%d' % (kappa, batchsize),
                      numpy.array(perplexity_vector))
        numpy.savetxt('time1_%.1f_%d' % (kappa, batchsize),
                      numpy.array(time1_vector))
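A possible companion plot (an assumption, not part of the original script): load the trace files written above for kappa=0.7 and batchsize=100 and plot the perplexity estimate against elapsed time.

import numpy
import matplotlib.pyplot as plt

t = numpy.loadtxt('time_0.7_100')
perwordbound = numpy.loadtxt('perplexity_0.7_100')

plt.plot(t, numpy.exp(-perwordbound))   # the file stores per-word bounds
plt.xlabel('Time in seconds')
plt.ylabel('Held-out perplexity estimate')
plt.savefig('perplexity_vs_time.png')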
Example #11
        print len(wordsInVocab)
        #check length of document and determine whether to bootstrap resample the words
        if len(wordsInVocab) > maxLen or resampleShortDocuments:
            adjustedWordsInVocab = []
            print 'resampling to length ' + str(maxLen)
            for j in range(0, maxLen):
                adjustedWordsInVocab.append(random.choice(wordsInVocab)) #random sampling WITH replacement
            wordsInVocab = adjustedWordsInVocab
        docset[i] = ' '.join(wordsInVocab) #create final space-separated pre-processed document
    return docset


for i in range(0, length_seed[0]):
    seednum = seednummat[i]
    print seednum
    n.random.seed(int(seednum))

    # Download some articles
    """ Need to do some pre-processing such that each document has less than maximum length N """
    (docset, articlenames) = wikirandom.get_random_wikipedia_articles(int(D))
    print 'enforcing document length requirement for privacy'
    docset = enforceDocumentMaxLength(docset, maxLen, vocabFilename, resampleShortDocuments) #JF: ensure that all documents are no longer than maxLen

    # """ Save the file """
    #the_filename = Data_PATH+'wiki_docs_seednum=%s' %(seednum)
    the_filename = os.path.join(Data_PATH, 'wiki_docs_seednum=%s' %(seednum))
    with open(the_filename, 'wb') as f:
        cPickle.dump(docset, f)
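The helper enforceDocumentMaxLength used above is cut off at the start of this example. The sketch below is a reconstruction under assumptions (in particular, the vocabulary-filtering and lowercasing steps are guesses from the call site), not the original definition, so it uses a distinct name:

import random

def enforce_document_max_length(docset, maxLen, vocabFilename, resampleShortDocuments):
    vocab = set(w.strip() for w in open(vocabFilename).readlines())
    for i in range(len(docset)):
        # Keep only tokens that appear in the vocabulary (assumed step).
        wordsInVocab = [w for w in docset[i].lower().split() if w in vocab]
        # Bootstrap-resample maxLen words with replacement, as in the surviving
        # fragment, when the document is too long or resampling is forced.
        if len(wordsInVocab) > maxLen or resampleShortDocuments:
            wordsInVocab = [random.choice(wordsInVocab) for _ in range(maxLen)]
        docset[i] = ' '.join(wordsInVocab)
    return docset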


Example #12
elbo_lst = []
scrape_time = 0.
examples = []
log_likelihoods = []
start_time_loop = time.time()
for t in range(n_iter):
    print '====================BATCH %d====================' % t
    sys.stdout.flush()
    articlenames = []
    n_requested = 0
    mats = []
    while n_requested < batch_size:
        request_size = min(batch_size - n_requested, max_retrieve)
        start_time = time.time()
        articles_temp, articlenames_temp = get_random_wikipedia_articles(
            request_size)
        sys.stdout.flush()
        end_time = time.time()
        scrape_time += end_time - start_time

        mat_temp = vectorizer.transform(articles_temp)
        mats.append(mat_temp)

        articlenames.extend(articlenames_temp)
        n_requested += request_size

        del articles_temp, articlenames_temp
    #mat = vectorizer.transform(articles)
    mat = scipy.sparse.vstack(tuple(mats), format='csr')
    mat = mat[filter(
        lambda d: mat[d].sum() > 1, range(mat.shape[0])
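The listing above is cut off mid-expression. A minimal, self-contained sketch of the row filter it appears to be building (keep only documents with more than one token after vectorization):

import numpy
import scipy.sparse

# Toy document-term matrix: the middle document has only one token.
mat = scipy.sparse.csr_matrix(numpy.array([[2, 1, 0],
                                           [1, 0, 0],
                                           [0, 3, 4]]))
keep = [d for d in range(mat.shape[0]) if mat[d].sum() > 1]
mat = mat[keep]           # drops the single-token document
print(mat.toarray())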
Example #13
def main():
    # unpack input arguments
    # seednum = 1
    # documentstoanalyze  = 2000
    # batchsize = 1000
    # priv = 0
    # epsilon = 1
    # comp = 2

    seednum = int(sys.argv[1])
    documentstoanalyze = int(sys.argv[2])
    batchsize = int(sys.argv[3])
    priv = int(sys.argv[4])  # 1 is private version, 0 is nonprivate version
    epsilon = float(sys.argv[5])  # total privacy budget
    comp = int(sys.argv[6])  # 0 conventional, 1 advanced, 2 CDP

    # The number of topics
    K = 100
    # D = 1000000
    D = 5000000

    nu = batchsize / float(D)  # sampling rate
    numpy.random.seed(seednum)

    print('seednum %s mini-batchsize %s and number of iter %s' %
          (seednum, batchsize, documentstoanalyze))

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    gamma_noise = 0  # will use Laplace noise all the time

    if comp == 2:
        # budget = numpy.sqrt(epsilon/float(documentstoanalyze))
        # budget = numpy.sqrt(epsilon*D/float(2*batchsize))
        budget = numpy.sqrt(2 * epsilon) / float(
            2 * nu * numpy.sqrt(documentstoanalyze))
    elif comp == 1:
        delta = 0.000001
        budget = epsilon / float(
            4 * nu * numpy.sqrt(2 * documentstoanalyze * numpy.log(1 / delta)))
    else:
        # budget = epsilon/float(documentstoanalyze)
        budget = epsilon / float(2 * documentstoanalyze * nu)

    if priv:
        print('private version')

    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7, priv,
                                 budget, gamma_noise)

    # the_filename = Data_PATH+'wiki_data'
    # with open(the_filename, 'rb') as f:
    #     docset = cPickle.load(f)

    # load all the documents
    # docset = []
    # for whichdoc in range(1, 21):
    #     the_filename = Data_PATH+'wikidata_seednum=_%s' %(whichdoc)
    #     with open(the_filename, 'rb') as f:
    #         docset1 = cPickle.load(f)
    #         docset = docset + docset1
    #         print "docset %s is loaded" %(whichdoc)
    #
    # print "docset all loaded"

    perplexity = numpy.zeros(documentstoanalyze)
    # D_test = 10000

    # for iteration in range(0, maxIter):
    for iteration in range(0, documentstoanalyze):
        # subset of data
        # rand_perm_nums =  numpy.random.permutation(len(docset))
        # idx_minibatch = rand_perm_nums[0:batchsize]
        # docsubset = list(docset[i] for i in idx_minibatch)

        # Download some articles
        (docset, articlenames) = \
         wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print('%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
              (iteration, olda._rhot, numpy.exp(-perwordbound)))

        # # Give them to online LDA
        # (gamma, bound) = olda.update_lambda_docs(docsubset)
        # # Compute an estimate of held-out perplexity
        # (wordids, wordcts) = onlineldavb.parse_doc_list(docsubset, olda._vocab)
        # perwordbound = bound * len(docsubset) / (D * sum(map(sum, wordcts)))
        # print '%d:  rho_t = %f,  training perplexity estimate = %f' % \
        #     (iteration, olda._rhot, numpy.exp(-perwordbound))

        # compute test perplexity
        # idx_test = rand_perm_nums[batchsize+1:batchsize+1+D_test]
        # doctest = list(docset[i] for i in idx_test)
        #
        # (gamma_test, ss) = olda.do_e_step_docs(doctest)
        # # Estimate held-out likelihood for current values of lambda.
        # bound_test = olda.approx_bound_docs(doctest, gamma_test)
        # (wordids, wordcts_test) = onlineldavb.parse_doc_list(doctest, olda._vocab)
        #
        # # perwordbound_test = bound_test*D_test / float(D*sum(map(sum, wordcts_test)))
        # perword_test_log_likelihood = bound_test / float(sum(map(sum, wordcts_test)))
        # print '%d:  rho_t = %f,  test perplexity estimate = %f' % \
        #     (iteration, olda._rhot, perword_test_log_likelihood)

        perplexity[iteration] = numpy.exp(-perwordbound)

    # save perplexity
    if priv:
        # if gamma_noise:
        #     method = 'private_epsilon_%s_cdp_%s_gamma_noise_%s' % (epsilon, cdp, gamma_noise)
        # else:
        #     method = 'private_epsilon_%s_cdp_%s' %(epsilon, cdp)
        method = 'private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s' % (
            sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
            sys.argv[6])
    else:
        method = 'Nonprivate_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s' % (
            sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
            sys.argv[6])

    numpy.save(method + '.npy', perplexity)
    # method = 'private_epsilon_1'
    # filename = method+'_D=_%s_S=_%s' %(D, batchsize)
    # numpy.save(filename+'.npy', test_log_likelihood)

    # save lambda and gamma
    numpy.savetxt(method + '_lambda.dat', olda._lambda)
    numpy.savetxt(method + '_gamma.dat', gamma)
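A worked check of the per-iteration noise budgets computed above, re-evaluating the three composition formulas with the commented-out defaults from the top of the example (documentstoanalyze=2000, batchsize=1000, epsilon=1, D=5e6). This is a sketch, not part of the original script.

import numpy

D, batchsize, documentstoanalyze, epsilon = 5000000, 1000, 2000, 1.0
delta = 0.000001
nu = batchsize / float(D)   # sampling rate

budget_cdp = numpy.sqrt(2 * epsilon) / float(2 * nu * numpy.sqrt(documentstoanalyze))
budget_adv = epsilon / float(4 * nu * numpy.sqrt(2 * documentstoanalyze * numpy.log(1 / delta)))
budget_conv = epsilon / float(2 * documentstoanalyze * nu)

print('CDP: %f  advanced: %f  conventional: %f' % (budget_cdp, budget_adv, budget_conv))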
Example #14
def getArticles(nr=5):
    """ Downloads and analyzes a bunch of random Wikipedia articles """
    (docs, names) = wikirandom.get_random_wikipedia_articles(nr)
    return (docs, names)
Example #15
def main(batchnumber=3.3e4):
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 64*8
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100

    documentstoanalyze = int(batchnumber)

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)
    # record time used for training
    start = time.time()
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    perplexity_plot = list()
    perplexity = []
    time_track = list()
    for iteration in range(1, documentstoanalyze+1):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        bound = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        perwordbound = bound * len(docset) / (D * sum(map(sum, olda._wordcts)))
        tmp = numpy.exp(-perwordbound)
        if iteration == 1:
            perplexity = tmp
        elif (tmp - perplexity) > 50:
            perplexity = perplexity + 50
        else:
            perplexity = tmp
        perplexity_plot.append(perplexity)
        time_track.append(time.time()-start)
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
            
    numpy.savetxt('lambda.dat', olda._lambda)

    #print time taken, save time to file
    end = time.time()
    time_track_file = open("time_track.txt","w")
    for item in time_track:
        time_track_file.write("%s\n"% item)
    time_track_file.close()
    print "time taken for training %f" % (end-start)
    perplexity_file = open("perplexity.txt","w")
    for per in perplexity_plot:
        perplexity_file.write("%s\n"% per)
    perplexity_file.close()
    #plot perplexity
    plt.figure(1)
    plt.plot(range(len(perplexity_plot)), perplexity_plot, 'g')
    plt.xlabel('Number of Iterations')
    plt.ylabel('Perplexity')
    #plt.show()
    #plt.pause(100)
    plt.savefig("perplexity%s.png" % batchnumber)

    plt.figure(2)
    plt.plot(time_track, perplexity_plot, 'g')
    plt.xlabel('Time in seconds')
    plt.ylabel('Perplexity')
    #plt.show()
    #plt.pause(100)
    plt.savefig("time_track%s.png" % batchnumber)