Example 1
    def process(self, docset):
        # Give them to online LDA
        (gamma, bound) = self.old_alpha.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids,
         wordcts) = onlineldavb.parse_doc_list(docset, self.old_alpha._vocab)
        perwordbound = bound * len(docset) / (self.D * sum(map(sum, wordcts)))
        print("%d:  rho_t = %f,  held-out perplexity estimate = %f" %
              (self.iteration, self.old_alpha._rhot, numpy.exp(-perwordbound)))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (self.iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % self.iteration,
                          self.old_alpha._lambda)
            numpy.savetxt('gamma-%d.dat' % self.iteration, gamma)

            # This prints the top words of the topics after each mini batch
            for k in range(0, len(self.old_alpha._lambda)):
                lambdak = list(self.old_alpha._lambda[k, :])
                lambdak = lambdak / sum(lambdak)
                temp = zip(lambdak, range(0, len(lambdak)))
                temp = sorted(temp, key=lambda x: x[0], reverse=True)
                print('topic %d:' % (k))
                # feel free to change the 5 here to however many top words fit your screen nicely.
                for i in range(0, 5):
                    print('%20s  \t---\t  %.4f' %
                          (self.vocab[temp[i][1]], temp[i][0]))
                print()
        self.iteration = self.iteration + 1
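The top-words loop above normalizes one row of lambda and sorts it by hand. The same ranking can be obtained with numpy.argsort; the sketch below is illustrative (the helper name and arguments are not part of onlineldavb), assuming _lambda is a K x W numpy array and vocab the matching list of W terms.

import numpy

def top_words(lam, vocab, k, n=5):
    # Normalize topic k's lambda row into approximate word weights and
    # return the n highest-weighted vocabulary terms with their weights.
    row = lam[k, :] / lam[k, :].sum()
    order = numpy.argsort(row)[::-1][:n]
    return [(vocab[i], row[i]) for i in order]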
Example 2
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    #batchsize = 64
    batchsize = 32
    # The total number of tweets
    #D=297861
    D = 1163
    # The number of topics
    #K = 20
    K = 10

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('dictnostops.txt').readlines()
    W = len(vocab)
    
    #open rawdata
    #train_file = open("congress_train.txt")
    train_file = open("text.txt")
    train = train_file.readlines()

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    #olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 128., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    start = 0
    for iteration in range(0, documentstoanalyze):
        # Take the next mini-batch of tweets
        #(docset, articlenames) = \
            #wikirandom.get_random_wikipedia_articles(batchsize)
        docset = train[start:(start+batchsize)]
        start += batchsize
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        print wordids
        print wordcts
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
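Every snippet in this collection repeats the same per-word-bound computation before printing the perplexity estimate. That shared pattern can be factored into a small helper; this is a sketch of the pattern only (the function name is illustrative, not part of onlineldavb).

import numpy

def perplexity_estimate(bound, docset, wordcts, D):
    # Scale the variational bound on the mini-batch up to a corpus of D
    # documents, normalize by the number of tokens in the batch, and
    # exponentiate to obtain the held-out perplexity estimate.
    tokens_in_batch = sum(map(sum, wordcts))
    perwordbound = bound * len(docset) / (D * tokens_in_batch)
    return numpy.exp(-perwordbound)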
Example 3
def main(doc_list, vocab_file):
    batch_size = 64
    D = len(doc_list) # number of documents
    K = 100 # number of topics

    vocab = file(vocab_file).readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents.
    for iteration in range(0, int(D / batch_size)):
        docset = doc_list[batch_size*iteration:(iteration+1)*batch_size]
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
Example 4
    def runLDA(self):
        for iteration in range(0, self.__documentstoanalyze):
            #Retrieve texts
            docset = self.__doc[iteration*self.__batchsize:(iteration+1)*self.__batchsize]

            # Give them to online LDA
            (gamma, bound) = self.__ldaObj.update_lambda(docset)


            # Compute an estimate of held-out perplexity
            (wordids, wordcts) = onlineldavb.parse_doc_list(docset, self.__ldaObj._vocab)
            perwordbound = bound * len(docset) / (self.__documentstoanalyze * sum(map(sum, wordcts)))
            print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, self.__ldaObj._rhot, numpy.exp(-perwordbound))


            # Save a temporary lambda for this iteration
            temp_lambda = self.__ldaObj._lambda

            # Save lambda and gamma
            if (iteration == 0):
                self.__lambda_all = temp_lambda
                self.__gamma_all = gamma
            else:
                self.__lambda_all = numpy.concatenate((self.__lambda_all, temp_lambda), axis=0)
                self.__gamma_all = numpy.concatenate((self.__gamma_all, gamma), axis=0)

            numpy.savetxt('./data/lambda.dat', self.__lambda_all)
            numpy.savetxt('./data/gamma.dat', self.__gamma_all)
Example 5
def scanAllDocs(cursor):
    n = 0
    row = cursor.fetchone()
    for iteration in range(0, documentstoanalyze + 1):
        # Download some articles
        docset = list()
        i = 0
        while row is not None and i < BATCHSIZE:
            docset.append(row.MyText)
            row = cursor.fetchone()
            i += 1
        n += i
        print ("Docs analyzed: ", n)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print ('%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound)))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 100 == 0):
            numpy.savetxt('lambda_{0}-{1}.dat'.format(K, iteration), olda._lambda)
            numpy.savetxt('gamma_{0}-{1}.dat'.format(K, iteration), gamma)

        if n >= D-1:
            numpy.savetxt('lambda_{0}-final.dat'.format(K), olda._lambda)
            numpy.savetxt('gamma_{0}-final.dat'.format(K), gamma)
            break
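The fetch-one-row-at-a-time batching in scanAllDocs can also be written as a generator, which keeps the cursor handling in one place. A minimal sketch, assuming a DB-API cursor whose rows expose the document text as a MyText attribute, as in the snippet above:

def iter_text_batches(cursor, batch_size):
    # Yield lists of up to batch_size document strings until the cursor
    # is exhausted.
    batch = []
    for row in iter(cursor.fetchone, None):
        batch.append(row.MyText)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch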
Example 6
    def run_lda(meaningful_words_path, batchsize, K, GAMMA_ITER_TIMES):
        t = Task.create_new_lda_task()
        global file_counts, g_all_files
        file_counts = calc_file_counts()
        D = file_counts
        t.status = Task.TASK_STATUS_STARTED
        t.save()

        try:
            # Remove the former results
            # TODO: save them in databases
            os.popen("rm yls_app/tools/lambda*")
            os.popen("rm yls_app/tools/gamma*")
            # How many documents to look at
            documentstoanalyze = int(D/batchsize) + 1
            print 'target iteration %d'%(documentstoanalyze)
            # Our vocabulary; we didn't use the default vocabulary.
            vocab = read_vocab(meaningful_words_path) 

            # Keep track of the last iteration
            last_iteration_perplexity = 0.0

            # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
            olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7, GAMMA_ITER_TIMES)
            # Run until we've seen all documents.
            for iteration in range(documentstoanalyze):
                # print 'iteration ... %d'%iteration
                # Download some articles
                docset = get_article(iteration * batchsize, batchsize)
                # Give them to online LDA
                (gamma, bound) = olda.update_lambda(docset)
                # Compute an estimate of held-out perplexity
                (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
                perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
                t.infomation = '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
                    (iteration, olda._rhot, numpy.exp(-perwordbound))
                last_iteration_perplexity = numpy.exp(-perwordbound)
                print perwordbound,bound
                if iteration == documentstoanalyze -1:
                    break
                print '%d,%f'%(iteration,last_iteration_perplexity)

                t.status = Task.TASK_STATUS_STARTED
                t.save()
                #print 'perplexity: %f'%(last_iteration_perplexity)

                # Save lambda, the parameters to the variational distributions
                # over topics, and gamma, the parameters to the variational
                # distributions over topic weights for the articles analyzed in
                # the last iteration.
                #if (iteration % 10 == 0):
                    #numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
                    #numpy.savetxt('gamma-%d.dat' % iteration, gamma)
                    #
                numpy.savetxt(LDARunner.LAMBDA_FILE, olda._lambda)
                numpy.savetxt(LDARunner.GAMMA_FILE, gamma)
        except Exception,e:
            print e
            t.infomation = "Exception:" + e.message
            Task.finish_task(t, False)
Example 7
def main():
    # LDA: a documents contains all the keywords of some journal/conference
    # equivalent to cluster keywords over journals/conferences
    journal_or_conference = sys.argv[1]
    num = int(sys.argv[2])
    conn = jcke.get_db_conn()

    # default (num <=0 or > max number): figure out all the journals/conferences keywords
    if num <= 0 or (num >= 15151 and journal_or_conference == "journal") or (num >= 4545 and journal_or_conference == "conference"):
        query = """
            SELECT COUNT(*) FROM ##journal_or_conference##
            """
        query = query.replace("##journal_or_conference##", journal_or_conference)
        conn.cursor.execute(query)
        num = conn.cursor.fetchone()[0]  # number of journals/conferences to process

    # document parsing
    journal_conf_list = os.listdir("journal_conf_keyword")
    # check if txt files exist, then generate those docs
    if not num == len(journal_conf_list):
        jcke.journal_conf_keyword_generation(conn, num, journal_or_conference)
        journal_conf_list = os.listdir("journal_conf_keyword")
    
    # The number of journal/conference keyword sets in each batch
    batchsize = lambda num: num if num <= 100 else 100
    batch = batchsize(num)
    iteration_times = int(num/batch)
    # The total number of journals/conferences
    DocNum = lambda journal_or_conference: 15151 if journal_or_conference == "journal" else 4545
    D = DocNum(journal_or_conference)
    # The number of topics
    K = 100	# maybe some other numbers
    # Our vocabulary : we need some vocabulary set!
    vocab = dict()
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    online_LDA = lda.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)

    # Run until we've seen D documents. (Feel free to interrupt *much* sooner than this.)
    for iteration in range(0, iteration_times):
        # getting documents (keyword sets)
        if iteration != iteration_times - 1:
            journal_conf_keyword_list = jcke.input_journal_conf_keywords(journal_conf_list[iteration * batch:(iteration + 1) * batch])
        else:
            journal_conf_keyword_list = jcke.input_journal_conf_keywords(journal_conf_list[iteration * batch:])
        # online LDA for keyword sets
        # here we update the vocabulary attribute of the package directly (dangerous!)
        online_LDA._vocab = jcke.vocabulary_generation(journal_conf_keyword_list, online_LDA._vocab)

        (gamma, bound) = online_LDA.update_lambda(journal_conf_keyword_list)
        # Compute an estimate of held-out perplexity
        (keywordids, keywordcts) = lda.parse_doc_list(journal_conf_keyword_list, online_LDA._vocab)
        perkeywordbound = bound * len(journal_conf_keyword_list) / (D * sum(map(sum, keywordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, online_LDA._rhot, numpy.exp(-perkeywordbound))

    # Save lambda, the parameters to the variational distributions over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in the last iteration.
    numpy.savetxt('lambda-%s.dat' % journal_or_conference, online_LDA._lambda)
    numpy.savetxt('gamma-%s.dat' % journal_or_conference, gamma)
Example 8
def main():
    """
    Loads and analyzes JACM abstracts using online VB for LDA.
    """

    articles = list()
    artnames = list()

    for line in file('./jacm/withIDAbstracts.txt').readlines():
        combo = line.split('\t')
        artnames.append(combo[0])
        articles.append(combo[1])

    # The number of documents to analyze each iteration
    batchsize = 1
    # The total number of documents in the corpus
    D = len(artnames)
    # The number of topics
    K = 54

    # How many documents to look at

    documentstoanalyze = len(artnames)

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, D):
        # Download some articles
        docset = list()
        docset.append(articles[iteration])
        articlenames = list()
        articlenames.append(artnames[iteration])

        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)

        print bound

        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        numpy.savetxt('./simpleLDA/gamma-%d.dat' % iteration, gamma)
        if (iteration % 50 == 0 or iteration == 616):
            numpy.savetxt('./simpleLDA/lambda-%d.dat' % iteration,
                          olda._lambda)
Example 9
def main():
    """
    Loads and analyzes paper keyword sets using online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of documents
    D = 1000
    # The number of topics
    K = 100

    # How many documents to look at
    documentstoanalyze = int(D / batchsize)

    # Our vocabulary
    vocab = file('./com_all_words.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        docset = []
        counts = []
        linecache.clearcache()
        startpoint = iteration * batchsize + 1
        # get the paper keywords in batches
        for i in range(batchsize):
            f1 = open('com_all_key.txt', 'r')
            f2 = open('com_all.txt', 'r')
            docset.append(
                linecache.getline('com_all_key.txt', min(D,
                                                         startpoint + i))[:-1])
            counts.append(
                linecache.getline('com_all.txt', min(D, startpoint + i))[:-1])
        # Give them to online LDA

# print docset[0]
        (gamma, bound) = olda.update_lambda(docset, counts)
        # Compute an estimate of held-out perplexity
        (wordids,
         wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, counts)
        # print [olda._vocab[x] for x in docset[0].split(';')], wordids[0], wordcts[0]
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda_paper-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma_paper-%d.dat' % iteration, gamma)
Example 10
def main():

    # The number of documents to analyze each iteration.
    batchsize = args.batchsize
    # The total number of documents in the corpus.
    D = args.num_docs
    # The number of topics.
    K = args.num_topics

    # How many documents to look at
    documentstoanalyze = int(D / batchsize)

    # The vocabulary

    vocab = file(args.vocab_file).readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    alpha = 1. / K  # prior on topic weights theta
    eta = 1. / K  # prior on p(w|topic) Beta
    tau_0 = args.tau_0  # learning parameter to downweight early documents
    kappa = args.kappa  # learning parameter; decay factor for influence of batches
    olda = onlineldavb.OnlineLDA(vocab, K, D, alpha, eta, tau_0, kappa)

    dataset_file = open(args.dataset)
    start = time.time()

    for iteration in range(0, documentstoanalyze):
        # Read a batch of articles.
        docset = batch_read(dataset_file, batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            i = iteration
            pct = round((i * 1.0 / documentstoanalyze) * 100, 2)
            elapsed = int(time.time() - start)
            Printer(
                "Processed {0} batches. ~ {1}% complete. Elapsed time: {2}s".
                format(i, pct, elapsed))
            if (iteration % args.model_out_freq == 0):
                numpy.savetxt(
                    '{0}lambda-{1}.dat'.format(args.outdir, iteration),
                    olda._lambda)
                numpy.savetxt(
                    '{0}gamma-{1}.dat'.format(args.outdir, iteration), gamma)

    numpy.savetxt('{0}lambda-final.dat'.format(args.outdir), olda._lambda)
    numpy.savetxt('{0}gamma-final.dat'.format(args.outdir), gamma)
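batch_read is not shown in this snippet. A plausible stand-in that reads one document per line from the already opened dataset file could look like the following; this is a hypothetical helper written to match how it is called above, not the project's actual implementation.

def batch_read(dataset_file, batchsize):
    # Read up to batchsize lines (one document per line) from an open file
    # object, stopping early at end of file.
    docs = []
    for _ in range(batchsize):
        line = dataset_file.readline()
        if not line:
            break
        docs.append(line.strip())
    return docs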
Example 11
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of documents
    D = 1000
    # The number of topics
    K = 100

    # How many documents to look at
    documentstoanalyze = int(D/batchsize)

    # Our vocabulary
    vocab = file('./com_all_words.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        docset = []
        counts = []
        linecache.clearcache()
        startpoint = iteration * batchsize + 1
        # get the paper keywords in batches
        for i in range(batchsize):
            f1 = open('com_all_key.txt', 'r')
            f2 = open('com_all.txt', 'r')
            docset.append(linecache.getline('com_all_key.txt', min(D, startpoint + i))[:-1])
            counts.append(linecache.getline('com_all.txt', min(D, startpoint + i))[:-1])
        # Give them to online LDA
        # print docset[0]
        (gamma, bound) = olda.update_lambda(docset, counts)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, counts)
        # print [olda._vocab[x] for x in docset[0].split(';')], wordids[0], wordcts[0]
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda_paper-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma_paper-%d.dat' % iteration, gamma)
Example 12
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        print docset[0]
        print docset[1]
        print docset[2]
        
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
Example 13
def main():

    # The number of documents to analyze each iteration.
    batchsize = args.batchsize
    # The total number of documents in the corpus.
    D = args.num_docs
    # The number of topics.
    K = args.num_topics

    # How many documents to look at
    documentstoanalyze = int(D/batchsize)

    # The vocabulary
    vocab = file(args.vocab_file).readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    alpha = 1./K  # prior on topic weights theta
    eta   = 1./K  # prior on p(w|topic) Beta
    tau_0 = args.tau_0  # learning parameter to downweight early documents
    kappa = args.kappa  # learning parameter; decay factor for influence of batches
    olda = onlineldavb.OnlineLDA(vocab, K, D, alpha, eta, tau_0, kappa)

    dataset_file = open(args.dataset)
    start = time.time()

    for iteration in range(0, documentstoanalyze):
        # Read a batch of articles.
        docset = batch_read(dataset_file, batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            i = iteration
            pct = round((i * 1.0 / documentstoanalyze) * 100, 2)
            elapsed = int(time.time() - start)
            Printer("Processed {0} batches. ~ {1}% complete. Elapsed time: {2}s"
                .format(i, pct, elapsed))
            if (iteration % args.model_out_freq == 0):
                numpy.savetxt('{0}lambda-{1}.dat'.format(args.outdir, iteration), olda._lambda)
                numpy.savetxt('{0}gamma-{1}.dat'.format(args.outdir, iteration), gamma)

    numpy.savetxt('{0}lambda-final.dat'.format(args.outdir), olda._lambda)
    numpy.savetxt('{0}gamma-final.dat'.format(args.outdir), gamma)
Example 14
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of questions on Stack Overflow
    D = 3.3e6
    # The number of topics
    K = 50

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D / batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('./vocab2.txt').readlines()
    W = len(vocab)

    # Our set of questions from Stack Overflow
    questions = QuestionSet(datafilename)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    print 'processing', documentstoanalyze
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        (docset, articlenames) = questions.get_batch(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
Example 15
def updateLda(olda, docset, gamma=None, nIts=10):
    
    D = olda._D
    
    for iteration in range(0, nIts):

        # Run online LDA with current document set
        (gamma, bound) = olda.update_lambda(docset, gamma)
        # Compute an estimate of held-out perplexity
        (_, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  held-out perplexity estimate = %f' % \
            (iteration, numpy.exp(-perwordbound))

    return (olda._lambda, gamma, olda)
Example 16
def main():
    """
    Loads and analyzes tweets
    """

    # The number of documents to analyze each iteration
    batchsize = 1000
    # The total number of tweets
    D = 1.0e6
    # The number of topics
    K = 50

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)
    
    # Load documents
    tweets = file('./tweets_linebyline.txt').readlines()

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.9)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        
        # Give some documents to online LDA
        start = iteration*batchsize
        end = (iteration+1)*batchsize  # slice end is exclusive, so no "- 1"
        (gamma, bound) = olda.update_lambda(tweets[start:end])
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(tweets[start:end], olda._vocab)
        perwordbound = bound * len(tweets[start:end]) / (D * sum(map(sum, wordcts)))
        print '%d (%d - %d):  rho_t = %f,  held-out perplexity estimate = \t %f' % \
            (iteration, start, end, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('data_lda/lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('data_lda/gamma-%d.dat' % iteration, gamma)
Example 17
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100

    # How many documents to look at
    if len(argv) < 2:
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(argv[1])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        docset, articlenames = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        gamma, bound = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        wordids, wordcts = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if iteration % 10 == 0:
            print "Iteration: ", iteration
            numpy.savetxt('lambda.dat', olda._lambda)
            numpy.savetxt('gamma.dat', gamma)
Example 18
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """
    
    doc_files=sys.argv[1]
    
    (docset, articlenames) = \
        load_documents(doc_files)
        
    D=len(docset) 
    
    # of topics
    K = int(sys.argv[2])
    
    # Our vocabulary
    vocab = file('./dictnostops_test.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    '''kappa set to 0 to eliminate decay'''
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    
   
   
    # Give them to online LDA
    (gamma, bound) = olda.update_lambda(docset)
    # Compute an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
    perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
    print '  rho_t = %f,  held-out perplexity estimate = %f' % \
        ( olda._rhot, numpy.exp(-perwordbound))

    # Save lambda, the parameters to the variational distributions
    # over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in
    # the last iteration.
    print (olda._lambda.shape)
    print (gamma.shape)
    
    numpy.savetxt('lambda.dat',  olda._lambda)
    numpy.savetxt('gamma.dat',   gamma)
Example 19
def make_topic_columns(lda, data, K, D, batchsize):
    questions = QuestionSet(data)
    allgamma = numpy.zeros((len(data), K))
    for iteration in range(0, len(data) / batchsize):
        start = iteration * batchsize
        end = start + batchsize
        # Download some articles
        (docset, articlenames) = questions.get_batch(start, end)
        # Give them to online LDA
        (gamma, bound) = lda.update_lambda(docset)
        allgamma[start:end, :] = gamma
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = parse_doc_list(docset, lda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print "%d:  rho_t = %f,  held-out perplexity estimate = %f" % (iteration, lda._rhot, numpy.exp(-perwordbound))
    # copy to dataframe
    for k in range(K):
        data["Topic%d" % k] = allgamma[:, k]
Example 20
def make_topic_columns(lda, data, K, D, batchsize):
    questions = QuestionSet(data)
    allgamma = numpy.zeros((len(data), K))
    for iteration in range(0, len(data) / batchsize):
        start = iteration * batchsize
        end = start + batchsize
        # Download some articles
        (docset, articlenames) = questions.get_batch(start, end)
        # Give them to online LDA
        (gamma, bound) = lda.update_lambda(docset)
        allgamma[start:end,:] = gamma
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = parse_doc_list(docset, lda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, lda._rhot, numpy.exp(-perwordbound))
    # copy to dataframe
    for k in range(K):
        data['Topic%d'%k] = allgamma[:,k]
Example 21
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    doc_files = sys.argv[1]

    (docset, articlenames) = \
        load_documents(doc_files)

    D = len(docset)

    # of topics
    K = int(sys.argv[2])

    # Our vocabulary
    vocab = file('./dictnostops_test.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    '''kappa set to 0 to eliminate decay'''
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)

    # Give them to online LDA
    (gamma, bound) = olda.update_lambda(docset)
    # Compute an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
    perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
    print '  rho_t = %f,  held-out perplexity estimate = %f' % \
        ( olda._rhot, numpy.exp(-perwordbound))

    # Save lambda, the parameters to the variational distributions
    # over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in
    # the last iteration.
    print(olda._lambda.shape)
    print(gamma.shape)

    numpy.savetxt('lambda.dat', olda._lambda)
    numpy.savetxt('gamma.dat', gamma)
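The gamma matrices saved by these scripts hold, per document, the parameters of the variational Dirichlet over topic weights. To inspect approximate topic proportions you can normalize each row; a minimal sketch, assuming gamma.dat was written by numpy.savetxt as above:

import numpy

gamma = numpy.loadtxt('gamma.dat')                # one row per document, one column per topic
theta = gamma / gamma.sum(axis=1, keepdims=True)  # approximate expected topic proportions
top_topic = theta.argmax(axis=1)                  # most probable topic per document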
Example 22
def main(num_batches, K):
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.

    Arguments:
    - num_batches: the number of batches to take (corpus_size = num_batches * batch_size)
    - K: the number of topics, determined from stdin
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, num_batches):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
Example 23
def main():
    '''
    Read papers
    '''
    papers_ = []

    with open('papers.csv', 'r') as csvfile:
        for line in csv.reader(csvfile, delimiter=',', quotechar='"'):
            papers_.append(line)

    D = len(papers_)

    # The number of topics
    K = 10

    # Our vocabulary
    vocab = open('./dictnostops.txt').readlines()

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)

    docset = [row[3] for row in papers_]
    #articlenames = [row[0] for row in papers_]

    # Give them to online LDA
    (gamma, bound) = olda.update_lambda_docs(docset)
    # Compute an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
    perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
    print('%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
        (1, olda._rhot, numpy.exp(-perwordbound)))

    # Save lambda, the parameters to the variational distributions
    # over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in
    # the last iteration.

    numpy.savetxt('lambda.dat', olda._lambda)
    numpy.savetxt('gamma.dat', gamma)

    #show topics
    printtopics.main(5)
Example 24
def main():
    """
    Uses online VB for LDA on Archive data.
    """

    # The number of documents to analyze each iteration
    batchsize = 1000
    # The total number of documents
    D = 7000
    # The number of topics
    K = 10
    # How many documents to look at
    documentstoanalyze = int(D/batchsize)
    if (len(sys.argv) > 1):
        K = int(sys.argv[1])

    # Our vocabulary
    vocab = file(data_dir+'/dictionary_all.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        docset= get_abstracts(iteration)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        numpy.savetxt('%darchive-lambda-%d.dat' % (K, iteration), olda._lambda)
        numpy.savetxt('%darchive-gamma-%d.dat' % (K, iteration), gamma)
Example 25
def allocate_topics(lda, data, K, batchsize, D):
    n_iterations = len(data) / batchsize
    questions = QuestionSet(data)
    topics = numpy.zeros((len(data), K))

    # derive topics from data in batches
    for iteration in range(0, n_iterations):
        start = iteration * batchsize
        end = start + batchsize
        (docset, _) = questions.get_batch(start, end)
        (gamma, bound) = lda.update_lambda(docset)
        topics[start:end,:] = gamma
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, lda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, lda._rhot, numpy.exp(-perwordbound))

    # copy to dataframe
    for k in range(K):
        data['Topic%d'%k] = topics[:,k]

    return topics
Example 26
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """
    global D
    global doc_list
    global last_gamma_file
    cut_words()
    print D
    print len(doc_list)
    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents
    #D = 500
    # The number of topics
    K = int(sys.argv[1])

    # How many documents to look at
    if (len(sys.argv) < 3):
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(sys.argv[2])

    # Our vocabulary
    vocab = file('./chineseNoStopWords.txt').readlines()
    #print vocab
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    print documentstoanalyze
    perplexity_set = []
    iter_set = []
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        '''
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        '''
        docset = doc_list[iteration*batchsize:(iteration+1)*batchsize]
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        perplexity_set.append(numpy.exp(-perwordbound))
        iter_set.append(iteration)
        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 100 == 0 or iteration==documentstoanalyze-1):
            numpy.savetxt('./res_'+sys.argv[1]+'/lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('./res_'+sys.argv[1]+'/gamma-%d.dat' % iteration, gamma)
    last_gamma_file = './res_'+sys.argv[1]+'/lambda-%d.dat'%(documentstoanalyze-1)
    save_lambda_path = 'last_lambda_'+sys.argv[1]+'.txt'
    flast = open(save_lambda_path,'w')
    flast.write(last_gamma_file)
    flast.close()
Example 27
def fit_olda_liveparse(doc_path, vocab_file, outdir, K, batch_size, iterations,\
    verbose_topics, anchors, tmv_pickle, lemmatize):
    """
    Analyzes a set of documents using online VB for LDA.
    """
    # instance to get random documents
    docgen = generalrandom.LiveparseDocGen(doc_path)

    # The total number of documents in the corpus
    D = docgen.getDocCount()

    # Our vocabulary
    vocab = [term.strip() for term in file(vocab_file).readlines()]
    W = len(vocab)

    # write out general settings to pickle file for use by TMV later
    if tmv_pickle:
        # save model settings: vocab, K, docgen
        f = open(join(outdir, 'settings.pickle'), 'w+')
        cPickle.dump((vocab, K, docgen), f)
        f.close()


    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7, anchors, \
        lem = lemmatize)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    iteration = 0
    old_perplexity = 1.0 * sys.maxint
    delta_perplexity = 1.0 * sys.maxint
    delta_perplexities = [old_perplexity] * 10
    logfile = open(join(outdir, 'log.out'), 'w+')
    while (iterations != 0 and iteration < iterations) or \
        sum(delta_perplexities)/10 > 0.001: # 0.1% change in sample perplexity
        if iteration > D/batch_size:
            print "killing due to iteration count"
            break

        iter_start = time.time()
        # Download some articles
        docset = docgen.get_random_articles(batch_size)

        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)

        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, \
            lemmatize)

        # estimate perplexity with the current batch
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        perplexity = numpy.exp(-perwordbound)
        delta_perplexity = abs(old_perplexity - perplexity) / perplexity
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f (%.2f%%)' % \
            (iteration, olda._rhot, perplexity, delta_perplexity * 100)
        logfile.write('%d:  rho_t = %f,  held-out perplexity estimate = %f (%.2f%%)\n' % (iteration, olda._rhot, perplexity, delta_perplexity * 100))
        old_perplexity = perplexity
        delta_perplexities.pop(0)
        delta_perplexities.append(delta_perplexity)

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt(join(outdir, 'lambda-%d.dat' % iteration), \
                olda._lambda)
            numpy.savetxt(join(outdir, 'gamma-%d.dat' % iteration), gamma)


            if verbose_topics:
                print_topics(K, 7, vocab, olda._lambda, anchors)


        iteration += 1

    if tmv_pickle:
        f = open(join(outdir,'olda.pickle'), 'w+')
        cPickle.dump(olda, f)
        f.close()
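fit_olda_liveparse keeps iterating while the average relative change in perplexity over the last 10 mini-batches stays above 0.1% (once any fixed iteration budget is used up). That convergence check can be isolated into a small predicate; the name and default threshold below are illustrative.

def perplexity_converged(delta_perplexities, threshold=0.001):
    # delta_perplexities holds the relative perplexity changes observed on the
    # most recent mini-batches; stop once their mean falls below threshold.
    return sum(delta_perplexities) / len(delta_perplexities) <= threshold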
Example 28
def main():

    # unpack input arguments
    # seednum = 1
    # documentstoanalyze  = 2000
    # batchsize = 10
    # priv = 1
    # epsilon = 1
    # comp = 2
    # mech = 0

    seednum = int(sys.argv[1])
    documentstoanalyze = int(sys.argv[2])
    batchsize = int(sys.argv[3])
    priv = int(sys.argv[4])  # 1 is private version, 0 is nonprivate version
    # epsilon = float(sys.argv[5]) # total privacy budget
    comp = int(sys.argv[5])  #
    mech = int(sys.argv[6])  # 0 for Gaussian, 1 for Laplace

    # The number of topics
    #K = 100
    K = 50  #JF

    # load data
    # the_filename = Data_PATH+'wiki_docsmallset'
    # with open(the_filename, 'rb') as f:
    #     docset = cPickle.load(f)

    #the_filename = Data_PATH+'wiki_docsmallset_D=%s' %(400000)
    the_filename = os.path.join(Data_PATH, 'wiki_docsmallset_D=%s' %
                                (400000))  #JF: Make this work on Windows
    if resampleShortDocs:
        the_filename = the_filename + '_resample_short_docs'
    with open(the_filename, 'rb') as f:
        docset = cPickle.load(f)

    D = len(docset)
    print 'document length: %s' % (D)

    nu = batchsize / float(D)  # sampling rate
    numpy.random.seed(seednum)

    print 'seednum %s mini-batchsize %s and number of iter %s' % (
        seednum, batchsize, documentstoanalyze)

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)
    """ privacy budget calculation """
    # (1) to set the same level of burned privacy, we first calculate MA composition
    #sigma = 1.00000000000000000000000000000000000001 #a small value to minimize the noise
    #sigma = 1.1  #an intermediate value
    sigma = 1.24  #an intermediate value
    #sigma = 1.5  #an intermediate value
    #sigma = 2 #a larger value, expected to substantially reduce privacy and performance.

    total_del = 1e-4
    J = documentstoanalyze
    total_eps_MA = cal_pri.moments_accountant(sigma, total_del, nu, J)
    print 'total privacy loss is %f' % (total_eps_MA)

    #(2) strong composition
    del_iter = 1e-6
    res = minimize_scalar(cal_pri.strong_composition,
                          bounds=(0, 50),
                          args=(total_eps_MA, total_del, J, nu, del_iter),
                          method='bounded')
    eps_iter = res.x

    gamma_noise = 0  # we don't use this at all.

    if comp == 0:  #MA
        c2 = 2 * np.log(1.25 / del_iter)
        eps_iter = np.sqrt(c2) / sigma
        budget = [eps_iter, del_iter]

    elif comp == 1:  #strong composition
        budget = [eps_iter, del_iter]
    else:
        print "we don't support this composition"

    if priv:
        print 'private version'

    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7, priv,
                                 budget, gamma_noise, mech)

    perplexity = numpy.zeros(documentstoanalyze)

    # for iteration in range(0, maxIter):
    for iteration in range(0, documentstoanalyze):
        # subset of data
        rand_perm_nums = numpy.random.permutation(len(docset))
        idx_minibatch = rand_perm_nums[0:batchsize]
        docsubset = list(docset[i] for i in idx_minibatch)

        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docsubset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docsubset, olda._vocab)
        perwordbound = bound * len(docsubset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        perplexity[iteration] = numpy.exp(-perwordbound)

    # save perplexity
    if priv:
        # if gamma_noise:
        #     method = 'private_epsilon_%s_cdp_%s_gamma_noise_%s' % (epsilon, cdp, gamma_noise)
        # else:
        #     method = 'private_epsilon_%s_cdp_%s' %(epsilon, cdp)
        # method = 'static_private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s_D=%s' %(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6], D)

        #method = Results_PATH+'static_private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s_Lap=%s_D=%s' %(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], total_eps_MA, sys.argv[5], sys.argv[6], D)
        method = os.path.join(
            Results_PATH,
            'static_private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s_Lap=%s_D=%s'
            % (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4],
               total_eps_MA, sys.argv[5], sys.argv[6], D))

    else:
        #method = Results_PATH+'static_nonprivate_seed=%s_J=%s_S=%s_priv=%s_D=%s' %(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], D)
        method = os.path.join(
            Results_PATH, 'static_nonprivate_seed=%s_J=%s_S=%s_priv=%s_D=%s' %
            (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], D))
    if resampleShortDocs:
        method = method + '_resample_short_docs'
    numpy.save(method + '.npy', perplexity)
    # method = 'private_epsilon_1'
    # filename = method+'_D=_%s_S=_%s' %(D, batchsize)
    # numpy.save(filename+'.npy', test_log_likelihood)

    # save lambda and gamma
    numpy.savetxt(method + '_lambda.dat', olda._lambda)
    numpy.savetxt(method + '_gamma.dat', gamma)
Example 29
def main():
    # unpack input arguments
    # seednum = 1
    # documentstoanalyze  = 2000
    # batchsize = 1000
    # priv = 0
    # epsilon = 1
    # comp = 2

    seednum = int(sys.argv[1])
    documentstoanalyze = int(sys.argv[2])
    batchsize = int(sys.argv[3])
    priv = int(sys.argv[4])  # 1 is private version, 0 is nonprivate version
    epsilon = float(sys.argv[5])  # total privacy budget
    comp = int(sys.argv[6])  # 0 conventional, 1 advanced, 2 CDP

    # The number of topics
    K = 100
    # D = 1000000
    D = 5000000

    nu = batchsize / float(D)  # sampling rate
    numpy.random.seed(seednum)

    print('seednum %s mini-batchsize %s and number of iter %s' %
          (seednum, batchsize, documentstoanalyze))

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    gamma_noise = 0  # will use Laplace noise all the time

    if comp == 2:
        # budget = numpy.sqrt(epsilon/float(documentstoanalyze))
        # budget = numpy.sqrt(epsilon*D/float(2*batchsize))
        budget = numpy.sqrt(2 * epsilon) / float(
            2 * nu * numpy.sqrt(documentstoanalyze))
    elif comp == 1:
        delta = 0.000001
        budget = epsilon / float(
            4 * nu * numpy.sqrt(2 * documentstoanalyze * numpy.log(1 / delta)))
    else:
        # budget = epsilon/float(documentstoanalyze)
        budget = epsilon / float(2 * documentstoanalyze * nu)

    if priv:
        print('private version')

    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7, priv,
                                 budget, gamma_noise)

    # the_filename = Data_PATH+'wiki_data'
    # with open(the_filename, 'rb') as f:
    #     docset = cPickle.load(f)

    # load all the documents
    # docset = []
    # for whichdoc in range(1, 21):
    #     the_filename = Data_PATH+'wikidata_seednum=_%s' %(whichdoc)
    #     with open(the_filename, 'rb') as f:
    #         docset1 = cPickle.load(f)
    #         docset = docset + docset1
    #         print "docset %s is loaded" %(whichdoc)
    #
    # print "docset all loaded"

    perplexity = numpy.zeros(documentstoanalyze)
    # D_test = 10000

    # for iteration in range(0, maxIter):
    for iteration in range(0, documentstoanalyze):
        # subset of data
        # rand_perm_nums =  numpy.random.permutation(len(docset))
        # idx_minibatch = rand_perm_nums[0:batchsize]
        # docsubset = list(docset[i] for i in idx_minibatch)

        # Download some articles
        (docset, articlenames) = \
         wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print('%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
              (iteration, olda._rhot, numpy.exp(-perwordbound)))

        # # Give them to online LDA
        # (gamma, bound) = olda.update_lambda_docs(docsubset)
        # # Compute an estimate of held-out perplexity
        # (wordids, wordcts) = onlineldavb.parse_doc_list(docsubset, olda._vocab)
        # perwordbound = bound * len(docsubset) / (D * sum(map(sum, wordcts)))
        # print '%d:  rho_t = %f,  training perplexity estimate = %f' % \
        #     (iteration, olda._rhot, numpy.exp(-perwordbound))

        # compute test perplexity
        # idx_test = rand_perm_nums[batchsize+1:batchsize+1+D_test]
        # doctest = list(docset[i] for i in idx_test)
        #
        # (gamma_test, ss) = olda.do_e_step_docs(doctest)
        # # Estimate held-out likelihood for current values of lambda.
        # bound_test = olda.approx_bound_docs(doctest, gamma_test)
        # (wordids, wordcts_test) = onlineldavb.parse_doc_list(doctest, olda._vocab)
        #
        # # perwordbound_test = bound_test*D_test / float(D*sum(map(sum, wordcts_test)))
        # perword_test_log_likelihood = bound_test / float(sum(map(sum, wordcts_test)))
        # print '%d:  rho_t = %f,  test perplexity estimate = %f' % \
        #     (iteration, olda._rhot, perword_test_log_likelihood)

        perplexity[iteration] = numpy.exp(-perwordbound)

    # save perplexity
    if priv:
        # if gamma_noise:
        #     method = 'private_epsilon_%s_cdp_%s_gamma_noise_%s' % (epsilon, cdp, gamma_noise)
        # else:
        #     method = 'private_epsilon_%s_cdp_%s' %(epsilon, cdp)
        method = 'private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s' % (
            sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
            sys.argv[6])
    else:
        method = 'Nonprivate_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s' % (
            sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
            sys.argv[6])

    numpy.save(method + '.npy', perplexity)
    # method = 'private_epsilon_1'
    # filename = method+'_D=_%s_S=_%s' %(D, batchsize)
    # numpy.save(filename+'.npy', test_log_likelihood)

    # save lambda and gamma
    numpy.savetxt(method + '_lambda.dat', olda._lambda)
    numpy.savetxt(method + '_gamma.dat', gamma)
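
For reference, a small self-contained sketch that evaluates the three per-iteration budget formulas used above on made-up settings; epsilon, batchsize, the iteration count and delta below are illustrative numbers, not values from any run in this listing.

import numpy

epsilon = 1.0                 # total privacy budget (illustrative)
D = 5000000                   # population size, as in the script above
batchsize = 1000              # illustrative
documentstoanalyze = 1000     # illustrative
delta = 0.000001              # as in the advanced-composition branch
nu = batchsize / float(D)     # sampling rate

budget_cdp = numpy.sqrt(2 * epsilon) / float(2 * nu * numpy.sqrt(documentstoanalyze))
budget_adv = epsilon / float(4 * nu * numpy.sqrt(2 * documentstoanalyze * numpy.log(1 / delta)))
budget_conv = epsilon / float(2 * documentstoanalyze * nu)

# with these numbers: CDP ~= 111.80, advanced ~= 7.52, conventional = 2.50
print('per-iteration budget  CDP: %.2f  advanced: %.2f  conventional: %.2f'
      % (budget_cdp, budget_adv, budget_conv))
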
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """
    # The number of documents to analyze each iteration
    batch_size = 4

    # Total number of documents in the population. For a fixed corpus,
    # this is the size of the corpus. In the truly online setting

    number_of_documents = 71

    # The number of topics
    number_of_topics = 1

    # establish mysql database connection
    database = MysqlMessager(database="keyword_app")
    sql = "select Abstract from PreprocessedAbstracts;"
    database.excute_sql(sql)
    row_iteration = database.fetch()
    abstracts = [row[0] for row in row_iteration]

    # How many documents to look at
    if len(sys.argv) < 2:
        documents_to_analyze = int(number_of_documents / batch_size)
    else:
        documents_to_analyze = int(sys.argv[1])

    # Our vocabulary
    all_keywords_file_path = "../../keywords/abstract_109.txt"
    with read_pickle_file(all_keywords_file_path) as content:
        vocab = list(content)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, number_of_topics, number_of_documents,
                                 1. / number_of_topics, 1. / number_of_topics,
                                 1024., 0.7)

    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)

    for iteration in range(0, documents_to_analyze):

        # set dataset as list that stores all abstracts
        doc_set = abstracts

        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(doc_set)

        # Compute an estimate of held-out perplexity
        (word_ids,
         word_count_times) = onlineldavb.parse_doc_list(doc_set, olda._vocab)

        per_word_bound = bound * len(doc_set) / (
            number_of_documents * sum(map(sum, word_count_times)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-per_word_bound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.

        if iteration % 10 == 0:
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
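
Note that the loop above passes the entire abstracts list to update_lambda on every iteration, so each update effectively sees the whole corpus rather than a mini-batch. A minimal sketch of per-iteration slicing in the style of the other examples in this listing; it reuses the names defined above (abstracts, batch_size, olda, documents_to_analyze) and is illustrative only, not part of the original script.

for iteration in range(0, documents_to_analyze):
    # take the next batch_size abstracts, wrapping around at the end of the corpus
    start = (iteration * batch_size) % len(abstracts)
    doc_set = abstracts[start:start + batch_size]
    (gamma, bound) = olda.update_lambda(doc_set)
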
Esempio n. 31
0
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """
    wn.ensure_loaded()
    wiki_pool = wiki_local.WikiPool()
    # The number of documents to analyze each iteration
    batchsize = 1
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 30

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D / batchsize)
    else:
        documentstoanalyze = int(sys.argv[1]) + 1

    # Our vocabulary
    #vocab = file('./dictnostops.txt').readlines()
    #vocab = file('./wordnet_nouns.txt').readlines()
    #vocab = file('./synset_dict.txt').readlines()
    #vocab = file('./wn_ambig_no_stop.txt').readlines()
    vocab = file('./mixed_wn_dict.txt').readlines()
    #vocab = []
    #for word in words.words():
    #    word = str(word).lower()
    #    word = re.sub(r'[^a-z]', '', word)
    #    if word != '':
    #        vocab.append(word)
    ##we get repeats because of upper -> lowercase?
    #vocab = set(vocab)
    #vocab = list(vocab)
    W = len(vocab)
    print W

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)

    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        (docset, articlenames) = \
            wiki_pool.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 50 == 0):
            numpy.savetxt(
                'data_ground_truth_disambig/lambda-%d.dat' % iteration,
                olda._lambda)
            numpy.savetxt(
                'data_ground_truth_disambig/gamma-%d.dat' % iteration, gamma)

    numpy.savetxt('data_ground_truth_disambig/lambda-%d.dat' % iteration,
                  olda._lambda)
    numpy.savetxt('data_ground_truth_disambig/gamma-%d.dat' % iteration, gamma)
    print "finished iterations"
    wiki_pool.end()
Esempio n. 32
0
def main():
    """
    Reads JACM abstracts (one "id<TAB>abstract" pair per line) and analyzes
    them using online VB for LDA.
    """
    articles = list()
    artnames = list()

    for line in file('./jacm/withIDAbstracts.txt').readlines():
        combo = line.split('\t')
        artnames.append(combo[0])
        articles.append(combo[1])

    # The number of documents to analyze each iteration
    batchsize = 1
    # The total number of documents in Wikipedia
    D = len(artnames)
    # The number of topics
    K = 54

    # How many documents to look at
 
    documentstoanalyze = len(artnames)

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, D):
        # Download some articles
        docset=list()
        docset.append(articles[iteration])
        articlenames=list()
        articlenames.append(artnames[iteration])
      
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        
        print bound
        
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        numpy.savetxt('./simpleLDA/gamma-%d.dat' % iteration, gamma)
        if (iteration % 50 == 0 or iteration==616):
            numpy.savetxt('./simpleLDA/lambda-%d.dat' % iteration, olda._lambda)
Esempio n. 33
0
def main(argv):

    doc_list = []

    argList = handleArgs(argv)
    #list the docs in pickledDocs folder
    p = "../data/pickledDocs/"
    l = listdir(p)
    fileList = [p + f for f in l]

    #for each pickled doclist, append all docs to master doclist
    for fi in fileList:
        with open(fi, 'rb') as d:
            docs = cPickle.load(d)
        for k, x in docs.iteritems():
            doc_list.append(x)
        print len(doc_list)

    #D is total number of docs to show to the model, K is number of topics
    goal_its = 80  #number of iterations to run LDA
    corp_size = len(doc_list)  #number of documents in the corpus
    D = corp_size * goal_its  #number of documents expected to see
    K = 10  #default topic value, if none given in parameters
    saveModel = False  #whether to save LDA model itself
    desc = ""  #for performing non-standard runs
    version = ""  #for having multiple models with same parameters
    hyper_param = ""  #for testing hyperparameters

    #define the vocabulary file we will be using
    vocab = helper_funcs.read_dict("../data/dictionary.txt")  #default dict

    #initialize an instance of the OnlineLDA algorithm
    #parameters - dictionary, num topics, learning rate, beta, tau, kappa
    #if the path to an OnlineLDA pickle is passed, it re-opens that pickle

    K = int(argList[0])
    vocab = str.split(file(argList[1]).read())
    if not (argList[2] is None):
        alpha = argList[2]
    else:
        alpha = 0.1
    if not (argList[3] is None):
        beta = argList[3]
    else:
        beta = 1.

    saveModel = False
    lda = onlineldavb.OnlineLDA(vocab, K, D, alpha, beta, 1024, 0.)
    print "created LDA with parameters:\nnumwords: " + str(
        len(vocab)) + "\n#topics: " + str(K) + "\nalpha: " + str(
            alpha) + "\nbeta: " + str(beta)

    paramTitle = hyper_param + str(
        len(vocab) / 1000) + "kwords_" + str(K) + "topics"

    folder = "../data/out/models/" + paramTitle
    if not isdir(folder):
        mkdir(folder)

    W = len(vocab)

    print "dictionary size: " + str(W)
    print paramTitle

    print folder
    #if desc.find("label") > -1:
    #    with open("../data/out/past_models/"+paramTitle+"/dictionary.txt",'wb') as f:
    #        voc = sorted(vocab.items(),key=operator.itemgetter(1))
    #        for x in voc:
    #            f.write(x[0]+"\n")
    #perform LDA on the document list for goal_its iterations, updating lambda
    for i in range(lda._updatect, goal_its):
        print doc_list
        print i
        (gamma, bound) = lda.update_lambda(doc_list)

        (wordids, wordcts) = onlineldavb.parse_doc_list(doc_list, lda._vocab)
        perwordbound = bound * len(doc_list) / (D * sum(map(sum, wordcts)))
        print np.exp(-perwordbound)

        #pickle the model and its output occasionally
        if (i + 1) == goal_its:
            if not isdir(folder):
                mkdir(folder)
            with open(folder + "/gamma.pickle", 'wb') as f:
                cp2 = cPickle.Pickler(f)
                cp2.dump(gamma)
            with open(folder + "/lambda.pickle", 'wb') as f:
                cp = cPickle.Pickler(f)
                cp.dump(lda._lambda)
            np.savetxt(folder + '/lambda.dat', lda._lambda)

            if saveModel:

                with open(folder + "/LDA.pickle", 'wb') as f:
                    cp3 = cPickle.Pickler(f)
                    cp3.dump(lda)
def main():
    """
    Reads paper keyword sets from com_all_key.txt / com_all.txt in parallel
    with MPI and analyzes them using online VB for LDA.
    """
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of documents in Wikipedia
    D = 1000  #D = 2129792 for the whole set
    # The number of topics
    K = 30

    # Our vocabulary
    vocab = file('./com_all_words.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    iteration = 0
    while iteration * batchsize * size <= D:
        # Download some articles
        docset = []
        counts = []
        linecache.clearcache()
        startpoint = iteration * batchsize * size + batchsize * rank + 1
        if startpoint > D:  # search to the end
            break  # stop
        # get the paper keywords in batches
        for i in range(batchsize):
            f1 = open('com_all_key.txt', 'r')
            f2 = open('com_all.txt', 'r')
            docset.append(
                linecache.getline('com_all_key.txt', min(D, startpoint))[:-1])
            counts.append(
                linecache.getline('com_all.txt', min(D, startpoint))[:-1])
            startpoint = startpoint + 1
        # print type(docset), type(docset[0]), docset[0]
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset, counts)
        # Compute an estimate of held-out perplexity
        (wordids,
         wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, counts)
        # print wordcts[0:5]
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        iteration = iteration + 1
        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.

    # print olda._lambda[0]
    gammas = comm.gather(gamma, root=0)
    lambdas = comm.gather(olda._lambda, root=0)
    if rank == 0:
        gamma_result = numpy.vstack(gammas)
        lambda_result = numpy.vstack(lambdas)
        # save the results gathered across ranks, not just rank 0's local copies
        numpy.savetxt('lambda_parallel.dat', lambda_result)
        numpy.savetxt('gamma_parallel.dat', gamma_result)
Esempio n. 35
0
def main(argv):
    
    doc_list    =   []
    
    K, folder, alpha, beta, saveModel     = handleArgs(argv)
    #list the docs in pickledDocs folder
    p   =   "../data/pickledDocs/"
    l   =   listdir(p)
    fileList    =   [p+f for f in l]
    
    #for each pickled doclist, append all docs to master doclist
    with open(folder.replace("dictionary","filelist"),'wb') as f:
        for fi in fileList:
            with open(fi,'rb') as d:
                docs    =   cPickle.load(d)
            for k,x in docs.iteritems(): 
                doc_list.append(x)
                f.write(k+"\n")
            print len(doc_list)
        
        
    
    #D is total number of docs to show to the model, K is number of topics
    goal_its    =   40                #number of iterations to run LDA 
    corp_size   =   len(doc_list)       #number of documents in the corpus
    D           =   corp_size*goal_its  #number of documents expected to see
    #K           =   10                  #default topic value, if none given in parameters
    #saveModel   =   False               #whether to save LDA model itself
    desc        =   ""                  #for performing non-standard runs
    version     =   ""                  #for having multiple models with same parameters
    hyper_param =   ""                  #for testing hyperparameters
    
    
    #initialize an instance of the OnlineLDA algorithm
    #parameters - dictionary, num topics, learning rate, beta, tau, kappa
    #if the path to an OnlineLDA pickle is passed, it re-opens that pickle

    #K           =   int(argList[0])
    vocab       =   str.split(file(folder).read())
#    if not (argList[2] is None):
#        alpha   = argList[2]
#    else:
#        alpha   =   0.1
#    if not (argList[3] is None):
#        beta    = argList[3]
#    else:
#        beta     =   1.
#
#    saveModel   =   argList[4]
    lda         =   onlineldavb.OnlineLDA(vocab,K,D,alpha,beta,1024,0.)
    print "created LDA with parameters:\nnumwords: "+str(len(vocab))+"\n#topics: "+str(K)+"\nalpha: "+str(alpha)+"\nbeta: "+str(beta)
           
    paramTitle  =   hyper_param+str(len(vocab)/1000)+"kwords_"+str(K)+"topics"
    
    folder  = "../data/out/models/"+paramTitle
    if not isdir(folder):
        mkdir(folder)
    
    W           =   len(vocab)
    
    print "dictionary size: " + str(W)
    print paramTitle
    print folder
    #if desc.find("label") > -1:
    #    with open("../data/out/past_models/"+paramTitle+"/dictionary.txt",'wb') as f:
    #        voc = sorted(vocab.items(),key=operator.itemgetter(1))
    #        for x in voc:
    #            f.write(x[0]+"\n")
    #perform LDA on the document list for goal_its iterations, updating lambda
    for i in range(lda._updatect,goal_its):
        print i
        (gamma, bound)      = lda.update_lambda(doc_list)
        
        (wordids, wordcts)  = onlineldavb.parse_doc_list(doc_list,lda._vocab)
        perwordbound        = bound * len(doc_list) / (D*sum(map(sum,wordcts)))
        print np.exp(-perwordbound)
        
        #pickle the model and its output occasionally
        if (i+1) == goal_its:
            if not isdir(folder):
                mkdir(folder)
            with open(folder+"/gamma.pickle",'wb') as f:
                cp2 = cPickle.Pickler(f)
                cp2.dump(gamma)
            with open(folder+"/lambda.pickle",'wb') as f:
                cp  = cPickle.Pickler(f)
                cp.dump(lda._lambda)
            np.savetxt(folder+'/lambda.dat', lda._lambda)
            
            
            if not (saveModel is None):
                
                with open(folder+"/LDA.pickle",'wb') as f:
                    cp3 = cPickle.Pickler(f)
                    cp3.dump(lda)
Esempio n. 36
0
if not len(sys.argv) == 4:
    folder  = "../data/out/"+paramTitle


print folder
#if desc.find("label") > -1:
#    with open("../data/out/past_models/"+paramTitle+"/dictionary.txt",'wb') as f:
#        voc = sorted(vocab.items(),key=operator.itemgetter(1))
#        for x in voc:
#            f.write(x[0]+"\n")
#perform LDA on the document list for goal_its iterations, updating lambda
for i in range(lda._updatect,goal_its):
    print i
    (gamma, bound)      = lda.update_lambda(doc_list)
    
    (wordids, wordcts)  = onlineldavb.parse_doc_list(doc_list,lda._vocab)
    perwordbound        = bound * len(doc_list) / (D*sum(map(sum,wordcts)))
    print np.exp(-perwordbound)
    
    #pickle the model and its output occasionally
    if (i+1) == goal_its:
        print doc_list[0]
        print gamma[0]
        if not isdir(folder):
            mkdir(folder)
        with open(folder+"/gamma.pickle",'wb') as f:
            cp2 = cPickle.Pickler(f)
            cp2.dump(gamma)
        with open(folder+"/lambda.pickle",'wb') as f:
            cp  = cPickle.Pickler(f)
            cp.dump(lda._lambda)
def main():
    # LDA: a documents contains all the keywords of some journal/conference
    # equivalent to cluster keywords over journals/conferences
    journal_or_conference = sys.argv[1]
    num = int(sys.argv[2])
    conn = jcke.get_db_conn()

    # default (num <=0 or > max number): figure out all the journals/conferences keywords
    if num <= 0 or (num >= 15151 and journal_or_conference == "journal") or (
            num >= 4545 and journal_or_conference == "conference"):
        query = """
		SELECT COUNT(*) FROM ##journal_or_conference##
		"""
        query = query.replace("##journal_or_conference##",
                              journal_or_conference)
        conn.cursor.execute(query)
        num = conn.cursor.fetchall()[0][0]  # number of journals/conferences to process

    # document parsing
    journal_conf_list = os.listdir("journal_conf_keyword")
    # check if txt files exist, then generate those docs
    if not num == len(journal_conf_list):
        jcke.journal_conf_keyword_generation(conn, num, journal_or_conference)
        journal_conf_list = os.listdir("journal_conf_keyword")

    # The number of journal/conference keyword sets in each batch (at most 100)
    batch = min(num, 100)
    iteration_times = int(num / batch)
    # The total number of journals/conferences
    D = 15151 if journal_or_conference == "journal" else 4545
    # The number of topics
    K = 100  # maybe some other numbers
    # Our vocabulary : we need some vocabulary set!
    vocab = dict()
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    online_LDA = lda.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)

    # Run until we've seen D documents. (Feel free to interrupt *much* sooner than this.)
    for iteration in range(0, iteration_times):
        # getting documents (keyword sets)
        if iteration != iteration_times - 1:
            journal_conf_keyword_list = jcke.input_journal_conf_keywords(
                journal_conf_list[iteration * batch:(iteration + 1) * batch])
        else:
            journal_conf_keyword_list = jcke.input_journal_conf_keywords(
                journal_conf_list[iteration * batch:])
    # online LDA for keyword sets
        # here we update the relative function in the package	(dangerous!)
        online_LDA._vocab = jcke.vocabulary_generation(
            journal_conf_keyword_list, online_LDA._vocab)

        (gamma, bound) = online_LDA.update_lambda(journal_conf_keyword_list)
        # Compute an estimate of held-out perplexity
        (keywordids,
         keywordcts) = lda.parse_doc_list(journal_conf_keyword_list,
                                          online_LDA._vocab)
        perkeywordbound = bound * len(journal_conf_keyword_list) / (
            D * sum(map(sum, keywordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, online_LDA._rhot, numpy.exp(-perkeywordbound))

    # Save lambda, the parameters to the variational distributions over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in the last iteration.
    numpy.savetxt('lambda-%s.dat' % journal_or_conference, online_LDA._lambda)
    numpy.savetxt('gamma-%s.dat' % journal_or_conference, gamma)
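
jcke.vocabulary_generation above grows online_LDA._vocab in place as new keyword documents arrive, but its implementation is not shown in this listing. A hypothetical sketch of what such a helper could look like, assuming the vocabulary is a word-to-id dictionary as in onlineldavb; this is not the actual jcke code.

def vocabulary_generation(keyword_docs, vocab):
    # hypothetical helper: add unseen (lowercased) keywords to a word -> id dict
    for doc in keyword_docs:
        for word in doc.split():
            word = word.lower()
            if word not in vocab:
                vocab[word] = len(vocab)  # assign the next free integer id
    return vocab

# usage sketch: vocab = vocabulary_generation(['online lda topic model'], dict())
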
Esempio n. 38
0
def fit_olda(parse, doc_path, doc_file, vocab_file, outdir, K, batch_size, \
    iterations, verbose_topics, anchors, tmv_pickle, lemmatize, final_pass, \
    full_doc_topics):
    """
    Analyzes a set of documents using online VB for LDA.
    """
    # instance to generate random documents
    if parse == "live":  # read and parse docs on the fly using vocab
        docgen = generalrandom.LiveparseDocGen(doc_path)
    else:  # alternative: preparsed
        docgen = generalrandom.PreparseDocGen(doc_file)

    # The total number of documents in Wikipedia
    D = docgen.getDocCount()
    if iterations == 0:
        iterations = max(D / batch_size, 10)

    # Our vocabulary
    if parse == "live" or verbose_topics:
        vocab = [term.strip() for term in file(vocab_file).readlines()]
        W = len(vocab)
    else:
        W = docgen.getTermCount()
        vocab = ["term " + str(w) for w in range(W)]

    # write out general settings to pickle file for use by TMV later
    if tmv_pickle:
        # save model settings: vocab, K, docgen
        f = open(join(outdir, 'settings.pickle'), 'w+')
        cPickle.dump((vocab, K, docgen, lemmatize), f)
        f.close()

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7, anchors, \
        lem = lemmatize, preparsed = (parse == "preparsed"))
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)

    iteration = 0
    old_perplexity = 1.0 * sys.maxint
    delta_perplexity = 1.0 * sys.maxint
    delta_perplexities = [old_perplexity] * 10
    logfile = open(join(outdir, 'log.out'), 'w+')

    while iteration < iterations and sum(
            delta_perplexities
    ) / 10 > 0.001:  # 0.1% change in sample perplexity

        iter_start = time.time()

        # Download some articles
        docset = docgen.get_random_articles(batch_size)

        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)

        # Compute an estimate of held-out perplexity
        if parse == "live":
            (wordids, wordcts) = onlineldavb.parse_doc_list(docset, \
                olda._vocab, lemmatize)
        else:
            (wordids, wordcts) = docset

        # estimate perplexity with the current batch
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        perplexity = numpy.exp(-perwordbound)
        delta_perplexity = abs(old_perplexity - perplexity) / perplexity
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f (%.2f%%)' % \
            (iteration, olda._rhot, perplexity, delta_perplexity * 100)
        logfile.write(
            '%d:  rho_t = %f,  held-out perplexity estimate = %f (%.2f%%)\n' %
            (iteration, olda._rhot, perplexity, delta_perplexity * 100))
        old_perplexity = perplexity
        delta_perplexities.pop(0)
        delta_perplexities.append(delta_perplexity)

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt(join(outdir, 'lambda-%d.dat' % iteration), \
                olda._lambda)
            numpy.savetxt(join(outdir, 'gamma-%d.dat' % iteration), gamma)

            if verbose_topics:
                print_topics(K, 7, vocab, olda._lambda, anchors)

        iteration += 1

    logfile.close()

    if tmv_pickle:
        f = open(join(outdir, 'olda.pickle'), 'w+')
        cPickle.dump(olda, f)
        f.close()

    # save final iters
    numpy.savetxt(join(outdir, 'lambda-final.dat'), olda._lambda)
    numpy.savetxt(join(outdir, 'gamma-%d.dat' % iteration), gamma)

    # do a final pass on all documents
    if (final_pass):
        fout = open(join(outdir, "gamma-final.dat"), 'w+')
        if not full_doc_topics:
            fout.write("doc.lda.id\ttopic.id\tscore\n")

        i = 0
        for doc in docgen:
            if parse == 'live':  #TODO: the parsers should return same order...
                doc = doc[1]
            (gamma, ss) = olda.do_e_step(doc)
            j = 0
            if not full_doc_topics:
                for g in gamma.tolist()[0]:
                    if g > 0.051:
                        fout.write("%d\t%d\t%f\n" % (i, j, g))
                    j += 1
                i += 1
            else:
                gf = gamma.tolist()[0]
                fout.write(('\t'.join(["%f"] * len(gf)) + '\n') % tuple(gf))
        fout.close()
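
The stopping rule in fit_olda keeps a rolling window of the last ten relative perplexity changes and stops once their mean drops below 0.1%. A small self-contained sketch of that moving-average check; the perplexity values are made up purely to illustrate the mechanics, and the window is seeded with a large value so that, as above, the loop cannot stop before it has seen real updates.

old_perplexity = float('inf')              # fit_olda uses sys.maxint here
delta_perplexities = [old_perplexity] * 10

for perplexity in [2400.0, 2100.0, 1900.0, 1850.0, 1845.0, 1844.0]:
    delta = abs(old_perplexity - perplexity) / perplexity
    old_perplexity = perplexity
    delta_perplexities.pop(0)
    delta_perplexities.append(delta)
    # stop once the mean relative change over the last ten updates is below 0.1%
    converged = sum(delta_perplexities) / 10 <= 0.001
    print('relative change %.4f  converged: %s' % (delta, converged))
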
Esempio n. 39
0
def main():
    """
    Loads annotated documents from ../annotated/odata.dat, filters them by
    sentiment category, and analyzes them using online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 5
    # The total number of documents in Wikipedia
    #D = 10
    # The number of topics
    K = 20

    #load my own dataset
    f = open('../annotated/odata.dat', 'rb')
    data = cPickle.load(f)
    f.close()

    if '-S' in sys.argv:
        catog = 'pos'
    else:
        catog = 'neg'
        
    docset = []
    for each in data:
        if each[1] == catog:
            docset.append(each[0][0][0])

    D = len(docset)  # number of documents in docset
    print D

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = open('./dictnostops.txt').readlines()
    W = len(vocab)


    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        #(docset, articlenames) = \
        #    wikirandom.get_random_wikipedia_articles(batchsize)
        #print type(docset[1]), type(docset[0])
        # Give them to online LDA
        
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
def main():
    """
    Retrieves the content of a set of text files whose content is obtained
    from SOAP API descriptors.
    """
    path = sys.argv[1]
    docs = os.listdir(path)

    # The number of documents to analyze each iteration
    rest = 1
    #batchsize = int(math.ceil(len(docs)/100))
    batchsize = 15
    #print len(docs)
    #while rest != 0:
    #    rest = len(docs) % batchsize
    #    if (rest != 0):
    #        batchsize = batchsize + 1
        
    # The total number of documents (is supposed to be a huge/infinite number in an online setting)
    D = 3.3e6
    #D = len(docs)
    # The number of topics
    K = 40
    #K = 50

    # How many documents to look at
    #print batchsize
    #print sys.argv[0]
    if (len(sys.argv) == 2):
        #print 'Got into IF...'
        documentstoanalyze = int(math.ceil(len(docs)/float(batchsize)))
    elif (len(sys.argv) == 3):
        documentstoanalyze = int(sys.argv[2])
    elif (len(sys.argv) == 4):
        documentstoanalyze = int(sys.argv[2])
        K = int(sys.argv[3])

    
    #print documentstoanalyze
    # Our vocabulary
    #vocab = file('./dictnostops.txt').readlines()
    vocab = file('./wlist_match10.txt').readlines()

    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 0.5, 0.5, 1024., 0.7)
    #olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Dictionary for storing gamma values of the processed text files
    gamma_all = dict()
    #olda = onlineldavb.OnlineLDA(vocab, K, D, 0.01, 0.01, 1024., 0.7)
    for iteration in range(1, documentstoanalyze+1):
        # Download some articles
        (docset, operation_id) = \
            get_file_content(iteration, batchsize, path)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        #print ('iteration %d:  rho_t = %f,  held-out perplexity estimate = %f ' % \
        #    (iteration, olda._rhot, numpy.exp(-perwordbound)))
        sys.stdout.write('\rBatchs of document analyzed: %d/%d' % (iteration, documentstoanalyze))
        sys.stdout.flush()

        # Store the gamma values into the gamma_all for each one of the text files
        # in the current iteration 
        for i in range(len(operation_id)):
            gamma_all[operation_id[i]] = list(gamma[i])

        print(iteration, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the text files analyzed in
        # the last iteration.
        #if (iteration % 10 == 0):
        #    numpy.savetxt('parameters/lambda-%d.dat' % iteration, olda._lambda)
        #    numpy.savetxt('parameters/gamma-%d.dat' % iteration, gamma)
        if (iteration == documentstoanalyze):
            numpy.savetxt('parameters/lambda-all.dat', olda._lambda)
    # Save gamma_all for all the processed text files
    print '\n'
    temp = gamma_all.items()
    temp = sorted(temp, key = lambda x: x[0])
    numpy.savetxt('parameters/gamma-all.dat' , [item[1] for item in temp])
Esempio n. 41
0
def fit_olda(parse, doc_path, doc_file, vocab_file, outdir, K, batch_size, \
    iterations, verbose_topics, anchors, tmv_pickle, lemmatize, final_pass, \
    full_doc_topics):
    """
    Analyzes a set of documents using online VB for LDA.
    """
    # instance to generate random documents
    if parse == "live": # read and parse docs on the fly using vocab
        docgen = generalrandom.LiveparseDocGen(doc_path)
    else: # alternative: preparsed
        docgen = generalrandom.PreparseDocGen(doc_file)

    # The total number of documents in Wikipedia
    D = docgen.getDocCount()
    if iterations == 0:
        iterations = max(D / batch_size, 10)

    # Our vocabulary
    if parse == "live" or verbose_topics:
        vocab = [term.strip() for term in file(vocab_file).readlines()]
        W = len(vocab)
    else:
        W = docgen.getTermCount()
        vocab = ["term " + str(w) for w in range(W)]

    # write out general settings to pickle file for use by TMV later
    if tmv_pickle:
        # save model settings: vocab, K, docgen
        f = open(join(outdir, 'settings.pickle'), 'w+')
        cPickle.dump((vocab, K, docgen, lemmatize), f)
        f.close()

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7, anchors, \
        lem = lemmatize, preparsed = (parse == "preparsed"))
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)

    iteration = 0
    old_perplexity = 1.0 * sys.maxint
    delta_perplexity = 1.0 * sys.maxint
    delta_perplexities = [old_perplexity] * 10
    logfile = open(join(outdir, 'log.out'), 'w+')


    while iteration < iterations and sum(delta_perplexities)/10 > 0.001: # 0.1% change in sample perplexity

        iter_start = time.time()

        # Download some articles
        docset = docgen.get_random_articles(batch_size)

        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)


        # Compute an estimate of held-out perplexity
        if parse == "live":
            (wordids, wordcts) = onlineldavb.parse_doc_list(docset, \
                olda._vocab, lemmatize)
        else:
            (wordids, wordcts) = docset

        # estimate perplexity with the current batch
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        perplexity = numpy.exp(-perwordbound)
        delta_perplexity = abs(old_perplexity - perplexity) / perplexity
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f (%.2f%%)' % \
            (iteration, olda._rhot, perplexity, delta_perplexity * 100)
        logfile.write('%d:  rho_t = %f,  held-out perplexity estimate = %f (%.2f%%)\n' % (iteration, olda._rhot, perplexity, delta_perplexity * 100))
        old_perplexity = perplexity
        delta_perplexities.pop(0)
        delta_perplexities.append(delta_perplexity)


        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt(join(outdir, 'lambda-%d.dat' % iteration), \
                olda._lambda)
            numpy.savetxt(join(outdir, 'gamma-%d.dat' % iteration), gamma)

            if verbose_topics:
                print_topics(K, 7, vocab, olda._lambda, anchors)

        iteration += 1

    logfile.close()

    if tmv_pickle:
        f = open(join(outdir,'olda.pickle'), 'w+')
        cPickle.dump(olda, f)
        f.close()

    # save final iters
    numpy.savetxt(join(outdir, 'lambda-final.dat'), olda._lambda)
    numpy.savetxt(join(outdir, 'gamma-%d.dat' % iteration), gamma)

    # do a final pass on all documents
    if (final_pass):
        fout = open(join(outdir, "gamma-final.dat"), 'w+')
        if not full_doc_topics:
            fout.write("doc.lda.id\ttopic.id\tscore\n")

        i = 0
        for doc in docgen:
            if parse == 'live': #TODO: the parsers should return same order...
                doc = doc[1]
            (gamma, ss) = olda.do_e_step(doc)
            j = 0
            if not full_doc_topics:
                for g in gamma.tolist()[0]:
                    if g > 0.051:
                        fout.write("%d\t%d\t%f\n" % (i,j,g))
                    j += 1
                i += 1
            else:
                gf = gamma.tolist()[0]
                fout.write(('\t'.join(["%f"]*len(gf))+'\n') % tuple(gf))
        fout.close()
Esempio n. 42
0
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100
    rho_t_vector = []
    perplexity_vector = []
    time_vector = []
    time1_vector = []

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D / batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7

    kappa = 0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., kappa)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    t1 = time.time()
    for iteration in tqdm(range(0, documentstoanalyze)):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        t = time.time()
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        t2 = time.time()
        time_vector.append(t2 - t1)
        if len(time1_vector) == 0:
            time1_vector.append(t2 - t)
        else:
            time1_vector.append(time1_vector[-1] + t2 - t)
        rho_t_vector.append(olda._rhot)
        # note: this stores the per-word bound itself, not numpy.exp(-perwordbound)
        perplexity_vector.append(perwordbound)

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)

        numpy.savetxt('time_%.1f_%d' % (kappa, batchsize),
                      numpy.array(time_vector))
        numpy.savetxt('rho_%.1f_%d' % (kappa, batchsize),
                      numpy.array(rho_t_vector))
        numpy.savetxt('perplexity_%.1f_%d' % (kappa, batchsize),
                      numpy.array(perplexity_vector))
        numpy.savetxt('time1_%.1f_%d' % (kappa, batchsize),
                      numpy.array(time1_vector))
Esempio n. 43
0
def main():
    """
    Analyzes scraped HTML pages using online VB for LDA.
    """
    
    # The number of topics
    K = 10
    # no of documents
    D = 300
    n_features = 1000

    # Our vocabulary
    vocab = list(set(file('./vocab').readlines()))
    W = len(vocab)
    
    # Add terms and topics to the DB
    db.init()
    db.add_terms(vocab)
    db.add_topics(K)
    
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)

    # grab documents
    ### Load your scraped pages, re-tokenize, and vectorize result.
    docset, docnames = [], []
    for filename in os.listdir(os.getcwd()):
        if filename.endswith('.html'): 
            tree = html.parse(filename)
            try: encoding = tree.xpath('//meta/@charset')[0]
            except IndexError: encoding = 'utf-8'

            with open(filename) as page:
                rawtext = page.read()
                try: rawtext = rawtext.decode(encoding, errors='backslashreplace')
                except TypeError: continue
                # encoding issues, see http://stackoverflow.com/questions/19527279/python-unicode-to-ascii-conversion
                docset += [clean_html(rawtext)]
                docnames += [filename[:-5]]
                if not(len(docset) % 10): print("loaded " + str(len(docset)) + " documents")

    # Give them to online LDA
    # Also computes an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
    (gamma, bound) = olda.update_lambda(wordids, wordcts)

    
    # Arrays for adding batches of data to the DB
    # doc_array = []
    # doc_term_array = []

    # for d in range(len(docnames)):
        # doc_array.append((docnames[d], docset[d]))
    doc_array = zip(docnames, docset)
        
    # Add a batch of docs to the DB; this is the one DB task that is not in
    # the separate DB write thread since later tasks depend on having doc ids.
    # Since writes take so long, this also balances the two threads time-wise.
    doc_ids = db.add_docs(doc_array)

    doc_topic_array = []
    for d in range(len(gamma)):
        doc_size = len(docset[d])
        for k in range(len(gamma[d])):
            doc_topic_array.append((doc_ids[d], k, gamma[d][k], gamma[d][k]/doc_size))
    db.add_doc_topics(doc_topic_array)

    perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
    print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
        (1, olda._rhot, numpy.exp(-perwordbound))

    # Save lambda, the parameters to the variational distributions
    # over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in
    # the last iteration.
    numpy.savetxt('lambda-%d.dat' % 1, olda._lambda)
    numpy.savetxt('gamma-%d.dat' % 1, gamma)
        
    topic_terms_array = []
    for topic in range(len(olda._lambda)):
        lambda_sum = sum(olda._lambda[topic])
            
        for term in range(len(olda._lambda[topic])):
            topic_terms_array.append((topic, term, olda._lambda[topic][term]/lambda_sum))
    db.update_topic_terms(K, topic_terms_array)
            
    gc.collect() # probably not necessary, but precautionary for long runs
    db.print_task_update()

    # The DB thread ends only when it has both run out of tasks and it has been
    # signaled that it will not be receiving any more tasks
    db.increment_batch_count()
    db.signal_end()
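
The nested loops above that build topic_terms_array are just row-normalizing lambda into per-topic term proportions. A small self-contained sketch of the same computation with numpy broadcasting; the random matrix stands in for olda._lambda and is not part of the original example.

import numpy

# stand-in for olda._lambda: K topics x W terms of positive weights
lam = numpy.random.gamma(100., 1. / 100., (3, 5))

# divide each row by its sum so every topic's term proportions sum to one
topic_term = lam / lam.sum(axis=1, keepdims=True)

# the same (topic, term, proportion) triples the loops above append
topic_terms_array = [(k, w, topic_term[k, w])
                     for k in range(topic_term.shape[0])
                     for w in range(topic_term.shape[1])]
print(topic_terms_array[:5])
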
def main():
    """
    Reads paper keyword sets from com_all_key.txt / com_all.txt in parallel
    with MPI and analyzes them using online VB for LDA.
    """
    comm = MPI.COMM_WORLD
    size = comm.Get_size()   
    rank = comm.Get_rank()
    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of documents in Wikipedia
    D = 1000	#D = 2129792 for the whole set
    # The number of topics
    K = 30

    # Our vocabulary
    vocab = file('./com_all_words.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    iteration = 0
    while iteration * batchsize * size <= D:
        # Download some articles
        docset = []
        counts = []
        linecache.clearcache()
        startpoint = iteration * batchsize * size + batchsize * rank + 1
        if startpoint > D:  # search to the end
            break  # stop
        # get the paper keywords in batches
        for i in range(batchsize):
            f1 = open('com_all_key.txt', 'r')
            f2 = open('com_all.txt', 'r')
            docset.append(linecache.getline('com_all_key.txt', min(D, startpoint))[:-1])
            counts.append(linecache.getline('com_all.txt', min(D, startpoint))[:-1])
            startpoint = startpoint + 1
        # print type(docset), type(docset[0]), docset[0]
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset, counts)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, counts)
        # print wordcts[0:5]
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        iteration = iteration + 1
        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        # print olda._lambda[0]
    gammas = comm.gather(gamma, root=0)
    lambdas = comm.gather(olda._lambda, root=0)
    if rank == 0:
        gamma_result = numpy.vstack(gammas)
        lambda_result = numpy.vstack(lambdas)
        # save the results gathered across ranks, not just rank 0's local copies
        numpy.savetxt('lambda_parallel.dat', lambda_result)
        numpy.savetxt('gamma_parallel.dat', gamma_result)
Esempio n. 45
0
def main():
    """
    Loads a locally segmented document collection (built by cut_words) and
    analyzes it using online VB for LDA.
    """
    global D
    global doc_list
    global last_gamma_file
    cut_words()
    print D
    print len(doc_list)
    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    #D = 500
    # The number of topics
    K = int(sys.argv[1])

    # How many documents to look at
    if (len(sys.argv) < 3):
        documentstoanalyze = int(D / batchsize)
    else:
        documentstoanalyze = int(sys.argv[2])

    # Our vocabulary
    vocab = file('./chineseNoStopWords.txt').readlines()
    #print vocab
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    print documentstoanalyze
    perplexity_set = []
    iter_set = []
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        '''
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        '''
        docset = doc_list[iteration * batchsize:(iteration + 1) * batchsize]
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        perplexity_set.append(numpy.exp(-perwordbound))
        iter_set.append(iteration)
        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 100 == 0 or iteration == documentstoanalyze - 1):
            numpy.savetxt(
                './res_' + sys.argv[1] + '/lambda-%d.dat' % iteration,
                olda._lambda)
            numpy.savetxt('./res_' + sys.argv[1] + '/gamma-%d.dat' % iteration,
                          gamma)
    last_gamma_file = './res_' + sys.argv[1] + '/lambda-%d.dat' % (
        documentstoanalyze - 1)
    save_lambda_path = 'last_lambda_' + sys.argv[1] + '.txt'
    flast = open(save_lambda_path, 'w')
    flast.write(last_gamma_file)
    flast.close()
Esempio n. 46
0
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)
    
    # Add terms and topics to the DB
    db.init()
    db.add_terms(vocab)
    db.add_topics(K)
    
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        
        # Arrays for adding batches of data to the DB
        doc_array = []
        doc_term_array = []
        
        for d in range(len(articlenames)):
            doc_array.append((articlenames[d], docset[d]))
        
        # Add a batch of docs to the DB; this is the one DB task that is not in
        # the separate DB write thread since later tasks depend on having doc ids.
        # Since writes take so long, this also balances the two threads time-wise.
        doc_ids = db.add_docs(doc_array)
	
        doc_topic_array = []
        for d in range(len(gamma)):
            doc_size = len(docset[d])
            for k in range(len(gamma[d])):
                doc_topic_array.append((doc_ids[d], k, gamma[d][k], gamma[d][k]/doc_size))
        db.add_doc_topics(doc_topic_array)

        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
            
            topic_terms_array =[]
            for topic in range(len(olda._lambda)):
                lambda_sum = sum(olda._lambda[topic])
                
                for term in range(len(olda._lambda[topic])):
                    topic_terms_array.append((topic, term, olda._lambda[topic][term]/lambda_sum))
            db.update_topic_terms(K, topic_terms_array)
                
            gc.collect() # probably not necessary, but precautionary for long runs
            db.print_task_update()
        db.increment_batch_count()
    
    # The DB thread ends only when it has both run out of tasks and it has been
    # signaled that it will not be receiving any more tasks
    db.signal_end()
Esempio n. 47
0
    pages = [strs.rstrip() for strs in pages]
    D = len(pages)

    pageID = range(0, D)
    nBatches = D / batchsize

    #Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 128., 0.7)

    #Run
    nBatches = 100
    for iteration in range(0, nBatches):

        #Grab Abstracts
        (docset, pagenames, pages,
         pageID) = grabAbstracts(pages, batchsize, pageID)

        #Give them to online LDA
        (gamma, bound) = lda.update_lambda(docset)

        #Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, lda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % (
            iteration, lda._rhot, numpy.exp(-perwordbound))

        #Save to file
        if (iteration % 10 == 0):
            numpy.savetxt('lambda.dat', lda._lambda)
            numpy.savetxt('gamma.dat', gamma)
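
This last fragment starts mid-function: pages, vocab, K, batchsize and the grabAbstracts helper are assumed to have been defined earlier in the original file. A hypothetical preamble showing the kind of setup the fragment expects; the file names and values are illustrative only.

import onlineldavb

batchsize = 10                               # abstracts per mini-batch (illustrative)
K = 20                                       # number of topics (illustrative)
vocab = open('dictnostops.txt').readlines()  # vocabulary, one term per line
pages = open('abstracts.txt').readlines()    # raw abstracts, one per line (hypothetical file)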