Example #1
def main(doc_list, vocab_file):
    batch_size = 64
    D = len(doc_list) # number of documents
    K = 100 # number of topics

    vocab = file(vocab_file).readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run one pass over the corpus, one mini-batch per iteration.
    for iteration in range(0, int(D / batch_size)):
        docset = doc_list[batch_size * iteration:(iteration + 1) * batch_size]
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
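
Every example in this collection reports the same diagnostic: the variational bound returned by the update step, rescaled to the full corpus and converted into a per-word perplexity estimate. The helper below is only a sketch that assumes the bound and wordcts values produced by onlineldavb above; it makes the arithmetic explicit.

import numpy

def perplexity_estimate(bound, n_batch_docs, D, wordcts):
    # Rescale the mini-batch bound to the corpus of D documents, divide by
    # the number of tokens observed in the batch, and exponentiate the
    # negative per-word bound, exactly as the inline code above does.
    n_tokens = sum(map(sum, wordcts))
    perwordbound = bound * n_batch_docs / (D * n_tokens)
    return numpy.exp(-perwordbound)

# Equivalent to the inline computation:
# perplexity_estimate(bound, len(docset), D, wordcts)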
Example #2
def main():
    # OLDA parameters
    D = 3.3e6
    K = 200
    with open('./tweetdict_stemmed.txt', 'rb') as f:
        vocab = f.readlines()
    W = len(vocab)
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)

    # Assign lambdas and the counter to the previous iteration
    if len(sys.argv) > 2:
        olda._lambda = numpy.loadtxt(sys.argv[1])
        counter = int(sys.argv[2])
    else:
        counter = 0

    # collect top words for each topic for hashtag prediction
    top_words = {}
    for k in range(0, len(olda._lambda)):
        lambdak = list(olda._lambda[k, :])
        lambdak = lambdak / sum(lambdak)
        temp = zip(lambdak, range(0, len(lambdak)))
        temp = sorted(temp, key=lambda x: x[0], reverse=True)
        top_words[str(k)] = vocab[temp[0][1]]

    # Connect to Mongo
    try:
        c = Connection(host="localhost", port=27017)
        print "Connected successfully"
    except ConnectionFailure, e:
        sys.stderr.write("Could not connect to MongoDB: %s" % e)
        sys.exit(1)
Example #3
    def __init__(self, batchsize, d, k, tau, kappa):
        self.__dp = dataParse.dataParse(os.path.abspath("./data/ideas.txt"))
        self.__result = self.__dp.concatedField(os.path.abspath("./data/fieldList.txt"))
        # the documents
        self.__doc = self.__result[0]
        # the field data
        self.__fid = self.__result[1]
        # the dictionary
        self.__vocab = file(os.path.abspath('./data/vocabulary.txt')).readlines()
        # the number of words in the dictionary
        self.__W = len(self.__vocab)
        # the number of documents to analyze in each iteration
        self.__batchsize = batchsize
        # the total number of documents
        self.__D = d
        # the number of topics
        self.__K = k
        # the number of iterations
        self.__documentstoanalyze = self.__D / self.__batchsize
        # tau_0
        self.__tau = tau
        # kappa
        self.__kappa = kappa
        # LDA instance (alpha=1/K, eta=1/K, tau_0=tau, kappa=kappa)
        self.__ldaObj = onlineldavb.OnlineLDA(self.__vocab, self.__K, self.__D, 1. / self.__K, 1. / self.__K, self.__tau * 1.0, self.__kappa)
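
The class above only constructs the OnlineLDA instance; its training loop is not shown. A hypothetical driver in the same style as the script examples is sketched below (names are assumptions, and whether the update method is update_lambda or update_lambda_docs depends on the onlineldavb variant in use).

import numpy
import onlineldavb

def run_online_lda(ldaObj, docs, D, batchsize, iterations):
    # Feed one mini-batch per iteration and report the perplexity estimate,
    # mirroring the loop used by the other examples in this collection.
    gamma = None
    for iteration in range(iterations):
        docset = docs[iteration * batchsize:(iteration + 1) * batchsize]
        if not docset:
            break
        (gamma, bound) = ldaObj.update_lambda(docset)
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, ldaObj._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print('%d: held-out perplexity estimate = %f' % (iteration, numpy.exp(-perwordbound)))
    return gamma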
Example #4
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    #batchsize = 64
    batchsize = 32
    # The total number of tweets
    #D=297861
    D = 1163
    # The number of topics
    #K = 20
    K = 10

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('dictnostops.txt').readlines()
    W = len(vocab)
    
    #open rawdata
    #train_file = open("congress_train.txt")
    train_file = open("text.txt")
    train = train_file.readlines()

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    #olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 128., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    offset = 0
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        #(docset, articlenames) = \
            #wikirandom.get_random_wikipedia_articles(batchsize)
        docset = train[offset:(offset + batchsize)]
        offset += batchsize
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        print wordids
        print wordcts
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
Example #5
def main():

    articles = list()
    artnames = list()

    for line in file('./jacm/withIDAbstracts.txt').readlines():
        combo = line.split('\t')
        artnames.append(combo[0])
        articles.append(combo[1])
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 1
    # The total number of documents
    D = len(artnames)
    # The number of topics
    K = 54

    # How many documents to look at

    documentstoanalyze = len(artnames)

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, D):
        # Take the next article
        docset = list()
        docset.append(articles[iteration])
        articlenames = list()
        articlenames.append(artnames[iteration])

        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)

        print bound

        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        numpy.savetxt('./simpleLDA/gamma-%d.dat' % iteration, gamma)
        if (iteration % 50 == 0 or iteration == 616):
            numpy.savetxt('./simpleLDA/lambda-%d.dat' % iteration,
                          olda._lambda)
Example #6
def main():
    """
    Analyzes batches of paper keywords read from local files using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of documents
    D = 1000
    # The number of topics
    K = 100

    # How many documents to look at
    documentstoanalyze = int(D / batchsize)

    # Our vocabulary
    vocab = file('./com_all_words.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Read a batch of documents
        docset = []
        counts = []
        linecache.clearcache()
        startpoint = iteration * batchsize + 1
        # get the paper keywords in batches
        for i in range(batchsize):
            docset.append(
                linecache.getline('com_all_key.txt', min(D, startpoint + i))[:-1])
            counts.append(
                linecache.getline('com_all.txt', min(D, startpoint + i))[:-1])
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset, counts)
        # Compute an estimate of held-out perplexity
        (wordids,
         wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, counts)
        # print [olda._vocab[x] for x in docset[0].split(';')], wordids[0], wordcts[0]
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda_paper-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma_paper-%d.dat' % iteration, gamma)
Example #7
def main():

    # The number of documents to analyze each iteration.
    batchsize = args.batchsize
    # The total number of documents in the corpus.
    D = args.num_docs
    # The number of topics.
    K = args.num_topics

    # How many documents to look at
    documentstoanalyze = int(D / batchsize)

    # The vocabulary

    vocab = file(args.vocab_file).readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    alpha = 1. / K  # prior on topic weights theta
    eta = 1. / K  # prior on p(w|topic) Beta
    tau_0 = args.tau_0  # learning parameter to downweight early documents
    kappa = args.kappa  # learning parameter; decay factor for influence of batches
    olda = onlineldavb.OnlineLDA(vocab, K, D, alpha, eta, tau_0, kappa)

    dataset_file = open(args.dataset)
    start = time.time()

    for iteration in range(0, documentstoanalyze):
        # Read a batch of articles.
        docset = batch_read(dataset_file, batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            i = iteration
            pct = round((i * 1.0 / documentstoanalyze) * 100, 2)
            elapsed = int(time.time() - start)
            Printer(
                "Processed {0} batches. ~ {1}% complete. Elapsed time: {2}s".
                format(i, pct, elapsed))
            if (iteration % args.model_out_freq == 0):
                numpy.savetxt(
                    '{0}lambda-{1}.dat'.format(args.outdir, iteration),
                    olda._lambda)
                numpy.savetxt(
                    '{0}gamma-{1}.dat'.format(args.outdir, iteration), gamma)

    numpy.savetxt('{0}lambda-final.dat'.format(args.outdir), olda._lambda)
    numpy.savetxt('{0}gamma-final.dat'.format(args.outdir), gamma)
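
batch_read is not part of onlineldavb and is not shown; the loop above only needs a helper that returns the next batchsize documents from the already-open dataset file. A minimal sketch under that assumption, with one document per line:

def batch_read(dataset_file, batchsize):
    # Read up to `batchsize` lines (one document per line) from an open
    # file object, stopping early at end of file.
    docset = []
    for _ in range(batchsize):
        line = dataset_file.readline()
        if not line:
            break
        docset.append(line.strip())
    return docset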
Example #8
    def __init__(self, K, tau0, kappa):
        # The total number of documents
        self.D = 330

        # Our vocabulary
        self.vocab = open('dictnostops.txt', 'rt').readlines()
        self.W = len(self.vocab)

        # Initialize the algorithm with alpha=1/K, eta=1/K, and the given tau0 and kappa
        self.old_alpha = onlineldavb.OnlineLDA(self.vocab, K, self.D, 1. / K,
                                               1. / K, tau0, kappa)

        self.iteration = 0
Example #9
def main():
    # The number of documents to analyze each iteration
    batchsize = 100

    # The total number of questions on Stack Overflow
    D = 3.3e6

    # The number of topics
    K = 20

    # Make sure the topics are included as features for analysis
    feature_names.extend('Topic%d' % k for k in range(K))

    print("Reading the vocabulary")
    vocab = [w.strip() for w in file('./vocab4.txt')]

    # How many words are in the vocabulary
    W = len(vocab)

    print("Reading the data")
    data = cu.get_dataframe(train_file)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)

    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)

    print("Extracting features")
    fea = extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                n_jobs=4)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
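
allocate_topics, cu, and the feature helpers belong to the surrounding competition code and are not shown. The sketch below is purely hypothetical: it assumes allocate_topics streams a text column of the dataframe through online LDA in mini-batches and stores the normalized topic weights as Topic0..Topic{K-1} columns; the column name 'BodyMarkdown' is a guess.

import numpy

def allocate_topics(lda, data, K, batchsize, D, text_column='BodyMarkdown'):
    # Hypothetical sketch: run the documents through online LDA in
    # mini-batches and attach the expected topic proportions as features.
    docs = list(data[text_column])
    topics = numpy.zeros((len(docs), K))
    for start in range(0, len(docs), batchsize):
        batch = docs[start:start + batchsize]
        (gamma, bound) = lda.update_lambda(batch)
        # Normalize each row of gamma to get expected topic proportions.
        topics[start:start + len(batch), :] = gamma / gamma.sum(axis=1)[:, numpy.newaxis]
    for k in range(K):
        data['Topic%d' % k] = topics[:, k]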
Example #10
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of questions on Stack Overflow
    D = 3.3e6
    # The number of topics
    K = 50

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D / batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('./vocab2.txt').readlines()
    W = len(vocab)

    # Our set of questions from Stack Overflow
    questions = QuestionSet(datafilename)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    print 'processing', documentstoanalyze
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        (docset, articlenames) = questions.get_batch(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
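
QuestionSet and datafilename come from the surrounding project; the loop only relies on an object whose get_batch(batchsize) returns a (documents, names) pair. A minimal stand-in, assuming one question per line of a plain-text file:

class QuestionSet(object):
    # Hypothetical stand-in that serves mini-batches of questions from a
    # text file, one document per line.
    def __init__(self, filename):
        self._docs = [line.strip() for line in open(filename)]
        self._pos = 0

    def get_batch(self, batchsize):
        batch = self._docs[self._pos:self._pos + batchsize]
        names = ['question-%d' % (self._pos + i) for i in range(len(batch))]
        self._pos += batchsize
        return (batch, names)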
Example #11
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100

    # How many documents to look at
    if len(argv) < 2:
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(argv[1])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        docset, articlenames = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        gamma, bound = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        wordids, wordcts = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if iteration % 10 == 0:
            print "Iteration: ", iteration
            numpy.savetxt('lambda.dat', olda._lambda)
            numpy.savetxt('gamma.dat', gamma)
Example #12
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    doc_files = sys.argv[1]

    (docset, articlenames) = \
        load_documents(doc_files)

    D = len(docset)

    # The number of topics
    K = int(sys.argv[2])

    # Our vocabulary
    vocab = file('./dictnostops_test.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024
    # kappa is set to 0 to eliminate decay (rho_t stays at 1)
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)

    # Give them to online LDA
    (gamma, bound) = olda.update_lambda(docset)
    # Compute an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
    perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
    print '  rho_t = %f,  held-out perplexity estimate = %f' % \
        ( olda._rhot, numpy.exp(-perwordbound))

    # Save lambda, the parameters to the variational distributions
    # over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in
    # the last iteration.
    print(olda._lambda.shape)
    print(gamma.shape)

    numpy.savetxt('lambda.dat', olda._lambda)
    numpy.savetxt('gamma.dat', gamma)
Example #13
def main(num_batches, K):
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.

    Arguments:
    - num_batches: the number of batches to process; corpus_size = num_batches * batch_size
    - K: the number of topics, read from standard input
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, num_batches):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
Example #14
def main():
    '''
    Read papers from papers.csv and run online LDA on them.
    '''
    papers_ = []

    with open('papers.csv', 'r') as csvfile:
        for line in csv.reader(csvfile, delimiter=',', quotechar='"'):
            papers_.append(line)

    D = len(papers_)

    # The number of topics
    K = 10

    # Our vocabulary
    vocab = open('./dictnostops.txt').readlines()

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)

    docset = [row[3] for row in papers_]
    #articlenames = [row[0] for row in papers_]

    # Give them to online LDA
    (gamma, bound) = olda.update_lambda_docs(docset)
    # Compute an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
    perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
    print('%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
        (1, olda._rhot, numpy.exp(-perwordbound)))

    # Save lambda, the parameters to the variational distributions
    # over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in
    # the last iteration.

    numpy.savetxt('lambda.dat', olda._lambda)
    numpy.savetxt('gamma.dat', gamma)

    #show topics
    printtopics.main(5)
Example #15
def main():
    """
    Runs online VB for LDA on Archive data.
    """

    # The number of documents to analyze each iteration
    batchsize = 1000
    # The total number of documents
    D = 7000
    # The number of topics
    K = 10
    # How many documents to look at
    documentstoanalyze = int(D/batchsize)
    if (len(sys.argv) > 1):
        K = int(sys.argv[1])

    # Our vocabulary
    vocab = file(data_dir+'/dictionary_all.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        docset= get_abstracts(iteration)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        numpy.savetxt('%darchive-lambda-%d.dat' % (K, iteration), olda._lambda)
        numpy.savetxt('%darchive-gamma-%d.dat' % (K, iteration), gamma)
Example #16
def main():
    # LDA: each document contains all the keywords of one journal/conference,
    # equivalent to clustering keywords over journals/conferences
    journal_or_conference = sys.argv[1]
    num = int(sys.argv[2])
    conn = jcke.get_db_conn()

    # default (num <=0 or > max number): figure out all the journals/conferences keywords
    if num <= 0 or (num >= 15151 and journal_or_conference == "journal") or (
            num >= 4545 and journal_or_conference == "conference"):
        query = """
		SELECT COUNT(*) FROM ##journal_or_conference##
		"""
        query = query.replace("##journal_or_conference##",
                              journal_or_conference)
        conn.cursor.execute(query)
        num = conn.cursor.fetchall()[0][0]  # number of journals/conferences to process

    # document parsing
    journal_conf_list = os.listdir("journal_conf_keyword")
    # check if txt files exist, then generate those docs
    if not num == len(journal_conf_list):
        jcke.journal_conf_keyword_generation(conn, num, journal_or_conference)
        journal_conf_list = os.listdir("journal_conf_keyword")

    # The number of journal/conference keyword sets in each batch
    batch = min(num, 100)
    iteration_times = int(num / batch)
    # The total number of journals/conferences
    D = 15151 if journal_or_conference == "journal" else 4545
    # The number of topics
    K = 100  # maybe some other numbers
    # Our vocabulary : we need some vocabulary set!
    vocab = dict()
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    online_LDA = lda.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)

    # Run until we've seen D documents. (Feel free to interrupt *much* sooner than this.)
    for iteration in range(0, iteration_times):
        # getting documents (keyword sets)
        if iteration != iteration_times - 1:
            journal_conf_keyword_list = jcke.input_journal_conf_keywords(
                journal_conf_list[iteration * batch:(iteration + 1) * batch])
        else:
            journal_conf_keyword_list = jcke.input_journal_conf_keywords(
                journal_conf_list[iteration * batch:])
        # online LDA for keyword sets
        # here we update the relevant attribute of the package's OnlineLDA instance in place (dangerous!)
        online_LDA._vocab = jcke.vocabulary_generation(
            journal_conf_keyword_list, online_LDA._vocab)

        (gamma, bound) = online_LDA.update_lambda(journal_conf_keyword_list)
        # Compute an estimate of held-out perplexity
        (keywordids,
         keywordcts) = lda.parse_doc_list(journal_conf_keyword_list,
                                          online_LDA._vocab)
        perkeywordbound = bound * len(journal_conf_keyword_list) / (
            D * sum(map(sum, keywordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, online_LDA._rhot, numpy.exp(-perkeywordbound))

    # Save lambda, the parameters to the variational distributions over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in the last iteration.
    numpy.savetxt('lambda-%s.dat' % journal_or_conference, online_LDA._lambda)
    numpy.savetxt('gamma-%s.dat' % journal_or_conference, gamma)
Example #17
def main():

    # unpack input arguments
    # seednum = 1
    # documentstoanalyze  = 2000
    # batchsize = 10
    # priv = 1
    # epsilon = 1
    # comp = 2
    # mech = 0

    seednum = int(sys.argv[1])
    documentstoanalyze = int(sys.argv[2])
    batchsize = int(sys.argv[3])
    priv = int(sys.argv[4])  # 1 is private version, 0 is nonprivate version
    # epsilon = float(sys.argv[5]) # total privacy budget
    comp = int(sys.argv[5])  # composition method: 0 for MA, 1 for strong composition
    mech = int(sys.argv[6])  # 0 for Gaussian, 1 for Laplace

    # The number of topics
    #K = 100
    K = 50  #JF

    # load data
    # the_filename = Data_PATH+'wiki_docsmallset'
    # with open(the_filename, 'rb') as f:
    #     docset = cPickle.load(f)

    #the_filename = Data_PATH+'wiki_docsmallset_D=%s' %(400000)
    the_filename = os.path.join(Data_PATH, 'wiki_docsmallset_D=%s' %
                                (400000))  #JF: Make this work on Windows
    if resampleShortDocs:
        the_filename = the_filename + '_resample_short_docs'
    with open(the_filename, 'rb') as f:
        docset = cPickle.load(f)

    D = len(docset)
    print 'number of documents: %s' % (D)

    nu = batchsize / float(D)  # sampling rate
    numpy.random.seed(seednum)

    print 'seednum %s mini-batchsize %s and number of iter %s' % (
        seednum, batchsize, documentstoanalyze)

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)
    """ privacy budget calculation """
    # (1) to set the same level of burned privacy, we first calculate MA composition
    #sigma = 1.00000000000000000000000000000000000001 #a small value to minimize the noise
    #sigma = 1.1  #an intermediate value
    sigma = 1.24  #an intermediate value
    #sigma = 1.5  #an intermediate value
    #sigma = 2 #a larger value, expected to substantially reduce privacy and performance.

    total_del = 1e-4
    J = documentstoanalyze
    total_eps_MA = cal_pri.moments_accountant(sigma, total_del, nu, J)
    print 'total privacy loss is %f' % (total_eps_MA)

    #(2) strong composition
    del_iter = 1e-6
    res = minimize_scalar(cal_pri.strong_composition,
                          bounds=(0, 50),
                          args=(total_eps_MA, total_del, J, nu, del_iter),
                          method='bounded')
    eps_iter = res.x

    gamma_noise = 0  # we don't use this at all.

    if comp == 0:  #MA
        c2 = 2 * np.log(1.25 / del_iter)
        eps_iter = np.sqrt(c2) / sigma
        budget = [eps_iter, del_iter]

    elif comp == 1:  #strong composition
        budget = [eps_iter, del_iter]
    else:
        print "we don't support this composition"

    if priv:
        print 'private version'

    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7, priv,
                                 budget, gamma_noise, mech)

    perplexity = numpy.zeros(documentstoanalyze)

    # for iteration in range(0, maxIter):
    for iteration in range(0, documentstoanalyze):
        # subset of data
        rand_perm_nums = numpy.random.permutation(len(docset))
        idx_minibatch = rand_perm_nums[0:batchsize]
        docsubset = list(docset[i] for i in idx_minibatch)

        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docsubset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docsubset, olda._vocab)
        perwordbound = bound * len(docsubset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        perplexity[iteration] = numpy.exp(-perwordbound)

    # save perplexity
    if priv:
        # if gamma_noise:
        #     method = 'private_epsilon_%s_cdp_%s_gamma_noise_%s' % (epsilon, cdp, gamma_noise)
        # else:
        #     method = 'private_epsilon_%s_cdp_%s' %(epsilon, cdp)
        # method = 'static_private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s_D=%s' %(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5], sys.argv[6], D)

        #method = Results_PATH+'static_private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s_Lap=%s_D=%s' %(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], total_eps_MA, sys.argv[5], sys.argv[6], D)
        method = os.path.join(
            Results_PATH,
            'static_private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s_Lap=%s_D=%s'
            % (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4],
               total_eps_MA, sys.argv[5], sys.argv[6], D))

    else:
        #method = Results_PATH+'static_nonprivate_seed=%s_J=%s_S=%s_priv=%s_D=%s' %(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], D)
        method = os.path.join(
            Results_PATH, 'static_nonprivate_seed=%s_J=%s_S=%s_priv=%s_D=%s' %
            (sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], D))
    if resampleShortDocs:
        method = method + '_resample_short_docs'
    numpy.save(method + '.npy', perplexity)
    # method = 'private_epsilon_1'
    # filename = method+'_D=_%s_S=_%s' %(D, batchsize)
    # numpy.save(filename+'.npy', test_log_likelihood)

    # save lambda and gamma
    numpy.savetxt(method + '_lambda.dat', olda._lambda)
    numpy.savetxt(method + '_gamma.dat', gamma)
Example #18
def main():
    """
    Analyzes paper keywords in parallel with MPI using online VB for LDA.
    """
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of documents
    D = 1000  #D = 2129792 for the whole set
    # The number of topics
    K = 30

    # Our vocabulary
    vocab = file('./com_all_words.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    iteration = 0
    while iteration * batchsize * size <= D:
        # Read a batch of documents
        docset = []
        counts = []
        linecache.clearcache()
        startpoint = iteration * batchsize * size + batchsize * rank + 1
        if startpoint > D:  # past the end of the corpus
            break
        # get the paper keywords in batches
        for i in range(batchsize):
            docset.append(
                linecache.getline('com_all_key.txt', min(D, startpoint))[:-1])
            counts.append(
                linecache.getline('com_all.txt', min(D, startpoint))[:-1])
            startpoint = startpoint + 1
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset, counts)
        # Compute an estimate of held-out perplexity
        (wordids,
         wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab, counts)
        # print wordcts[0:5]
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        iteration = iteration + 1
        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.

# print olda._lambda[0]
    gammas = comm.gather(gamma, root=0)
    lambdas = comm.gather(olda._lambda, root=0)
    if rank == 0:
        gamma_result = numpy.vstack(gammas)
        lambda_result = numpy.vstack(lambdas)
        numpy.savetxt('lambda_parallel.dat', lambda_result)
        numpy.savetxt('gamma_parallel.dat', gamma_result)
Example #19
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """
    wn.ensure_loaded()
    wiki_pool = wiki_local.WikiPool()
    # The number of documents to analyze each iteration
    batchsize = 1
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 30

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D / batchsize)
    else:
        documentstoanalyze = int(sys.argv[1]) + 1

    # Our vocabulary
    #vocab = file('./dictnostops.txt').readlines()
    #vocab = file('./wordnet_nouns.txt').readlines()
    #vocab = file('./synset_dict.txt').readlines()
    #vocab = file('./wn_ambig_no_stop.txt').readlines()
    vocab = file('./mixed_wn_dict.txt').readlines()
    #vocab = []
    #for word in words.words():
    #    word = str(word).lower()
    #    word = re.sub(r'[^a-z]', '', word)
    #    if word != '':
    #        vocab.append(word)
    ##we get repeats because of upper -> lowercase?
    #vocab = set(vocab)
    #vocab = list(vocab)
    W = len(vocab)
    print W

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)

    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        (docset, articlenames) = \
            wiki_pool.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 50 == 0):
            numpy.savetxt(
                'data_ground_truth_disambig/lambda-%d.dat' % iteration,
                olda._lambda)
            numpy.savetxt(
                'data_ground_truth_disambig/gamma-%d.dat' % iteration, gamma)

    numpy.savetxt('data_ground_truth_disambig/lambda-%d.dat' % iteration,
                  olda._lambda)
    numpy.savetxt('data_ground_truth_disambig/gamma-%d.dat' % iteration, gamma)
    print "finished iterations"
    wiki_pool.end()
Example #20
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """
    global D
    global doc_list
    global last_gamma_file
    cut_words()
    print D
    print len(doc_list)
    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents (set globally by cut_words)
    #D = 500
    # The number of topics
    K = int(sys.argv[1])

    # How many documents to look at
    if (len(sys.argv) < 3):
        documentstoanalyze = int(D / batchsize)
    else:
        documentstoanalyze = int(sys.argv[2])

    # Our vocabulary
    vocab = file('./chineseNoStopWords.txt').readlines()
    #print vocab
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    print documentstoanalyze
    perplexity_set = []
    iter_set = []
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        '''
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        '''
        docset = doc_list[iteration * batchsize:(iteration + 1) * batchsize]
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        perplexity_set.append(numpy.exp(-perwordbound))
        iter_set.append(iteration)
        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 100 == 0 or iteration == documentstoanalyze - 1):
            numpy.savetxt(
                './res_' + sys.argv[1] + '/lambda-%d.dat' % iteration,
                olda._lambda)
            numpy.savetxt('./res_' + sys.argv[1] + '/gamma-%d.dat' % iteration,
                          gamma)
    last_gamma_file = './res_' + sys.argv[1] + '/lambda-%d.dat' % (
        documentstoanalyze - 1)
    save_lambda_path = 'last_lambda_' + sys.argv[1] + '.txt'
    flast = open(save_lambda_path, 'w')
    flast.write(last_gamma_file)
    flast.close()
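
cut_words() fills the module-level D and doc_list globals before training, and the chineseNoStopWords.txt vocabulary suggests the raw documents are Chinese and need segmentation. The version below is hypothetical: the corpus filename and the use of the jieba tokenizer are assumptions, since the original helper is not shown.

# -*- coding: utf-8 -*-
import jieba

doc_list = []
D = 0

def cut_words(corpus_file='corpus.txt'):
    # Hypothetical: segment each raw document into space-separated tokens
    # so that parse_doc_list can split on whitespace, then set the globals.
    global D, doc_list
    doc_list = []
    for line in open(corpus_file):
        tokens = jieba.cut(line.strip())
        doc_list.append(' '.join(tokens))
    D = len(doc_list)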
Example #21
def main():
    # unpack input arguments
    # seednum = 1
    # documentstoanalyze  = 2000
    # batchsize = 1000
    # priv = 0
    # epsilon = 1
    # comp = 2

    seednum = int(sys.argv[1])
    documentstoanalyze = int(sys.argv[2])
    batchsize = int(sys.argv[3])
    priv = int(sys.argv[4])  # 1 is private version, 0 is nonprivate version
    epsilon = float(sys.argv[5])  # total privacy budget
    comp = int(sys.argv[6])  # 0 conventional, 1 advanced, 2 CDP

    # The number of topics
    K = 100
    # D = 1000000
    D = 5000000

    nu = batchsize / float(D)  # sampling rate
    numpy.random.seed(seednum)

    print('seednum %s mini-batchsize %s and number of iter %s' %
          (seednum, batchsize, documentstoanalyze))

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    gamma_noise = 0  # will use Laplace noise all the time

    if comp == 2:
        # budget = numpy.sqrt(epsilon/float(documentstoanalyze))
        # budget = numpy.sqrt(epsilon*D/float(2*batchsize))
        budget = numpy.sqrt(2 * epsilon) / float(
            2 * nu * numpy.sqrt(documentstoanalyze))
    elif comp == 1:
        delta = 0.000001
        budget = epsilon / float(
            4 * nu * numpy.sqrt(2 * documentstoanalyze * numpy.log(1 / delta)))
    else:
        # budget = epsilon/float(documentstoanalyze)
        budget = epsilon / float(2 * documentstoanalyze * nu)

    if priv:
        print('private version')

    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7, priv,
                                 budget, gamma_noise)

    # the_filename = Data_PATH+'wiki_data'
    # with open(the_filename, 'rb') as f:
    #     docset = cPickle.load(f)

    # load all the documents
    # docset = []
    # for whichdoc in range(1, 21):
    #     the_filename = Data_PATH+'wikidata_seednum=_%s' %(whichdoc)
    #     with open(the_filename, 'rb') as f:
    #         docset1 = cPickle.load(f)
    #         docset = docset + docset1
    #         print "docset %s is loaded" %(whichdoc)
    #
    # print "docset all loaded"

    perplexity = numpy.zeros(documentstoanalyze)
    # D_test = 10000

    # for iteration in range(0, maxIter):
    for iteration in range(0, documentstoanalyze):
        # subset of data
        # rand_perm_nums =  numpy.random.permutation(len(docset))
        # idx_minibatch = rand_perm_nums[0:batchsize]
        # docsubset = list(docset[i] for i in idx_minibatch)

        # Download some articles
        (docset, articlenames) = \
         wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print('%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
              (iteration, olda._rhot, numpy.exp(-perwordbound)))

        # # Give them to online LDA
        # (gamma, bound) = olda.update_lambda_docs(docsubset)
        # # Compute an estimate of held-out perplexity
        # (wordids, wordcts) = onlineldavb.parse_doc_list(docsubset, olda._vocab)
        # perwordbound = bound * len(docsubset) / (D * sum(map(sum, wordcts)))
        # print '%d:  rho_t = %f,  training perplexity estimate = %f' % \
        #     (iteration, olda._rhot, numpy.exp(-perwordbound))

        # compute test perplexity
        # idx_test = rand_perm_nums[batchsize+1:batchsize+1+D_test]
        # doctest = list(docset[i] for i in idx_test)
        #
        # (gamma_test, ss) = olda.do_e_step_docs(doctest)
        # # Estimate held-out likelihood for current values of lambda.
        # bound_test = olda.approx_bound_docs(doctest, gamma_test)
        # (wordids, wordcts_test) = onlineldavb.parse_doc_list(doctest, olda._vocab)
        #
        # # perwordbound_test = bound_test*D_test / float(D*sum(map(sum, wordcts_test)))
        # perword_test_log_likelihood = bound_test / float(sum(map(sum, wordcts_test)))
        # print '%d:  rho_t = %f,  test perplexity estimate = %f' % \
        #     (iteration, olda._rhot, perword_test_log_likelihood)

        perplexity[iteration] = numpy.exp(-perwordbound)

    # save perplexity
    if priv:
        # if gamma_noise:
        #     method = 'private_epsilon_%s_cdp_%s_gamma_noise_%s' % (epsilon, cdp, gamma_noise)
        # else:
        #     method = 'private_epsilon_%s_cdp_%s' %(epsilon, cdp)
        method = 'private_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s' % (
            sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
            sys.argv[6])
    else:
        method = 'Nonprivate_seed=%s_J=%s_S=%s_priv=%s_epsilon=%s_compo=%s' % (
            sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5],
            sys.argv[6])

    numpy.save(method + '.npy', perplexity)
    # method = 'private_epsilon_1'
    # filename = method+'_D=_%s_S=_%s' %(D, batchsize)
    # numpy.save(filename+'.npy', test_log_likelihood)

    # save lambda and gamma
    numpy.savetxt(method + '_lambda.dat', olda._lambda)
    numpy.savetxt(method + '_gamma.dat', gamma)
Example #22
def main():
    """
    Analyzes preprocessed abstracts fetched from a MySQL database using
    online VB for LDA.
    """
    # The number of documents to analyze each iteration
    batch_size = 4

    # Total number of documents in the population. For a fixed corpus,
    # this is the size of the corpus; in the truly online setting, this is
    # an estimate of the maximum number of documents that could ever be seen.
    number_of_documents = 71

    # The number of topics
    number_of_topics = 1

    # establish mysql database connection
    database = MysqlMessager(database="keyword_app")
    sql = "select Abstract from PreprocessedAbstracts;"
    database.excute_sql(sql)
    row_iteration = database.fetch()
    abstracts = [row[0] for row in row_iteration]

    # How many documents to look at
    if len(sys.argv) < 2:
        documents_to_analyze = int(number_of_documents / batch_size)
    else:
        documents_to_analyze = int(sys.argv[1])

    # Our vocabulary
    all_keywords_file_path = "../../keywords/abstract_109.txt"
    with read_pickle_file(all_keywords_file_path) as content:
        vocab = list(content)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, number_of_topics, number_of_documents,
                                 1. / number_of_topics, 1. / number_of_topics,
                                 1024., 0.7)

    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)

    for iteration in range(0, documents_to_analyze):

        # set dataset as list that stores all abstracts
        doc_set = abstracts

        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(doc_set)

        # Compute an estimate of held-out perplexity
        (word_ids,
         word_count_times) = onlineldavb.parse_doc_list(doc_set, olda._vocab)

        per_word_bound = bound * len(doc_set) / (
            number_of_documents * sum(map(sum, word_count_times)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-per_word_bound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.

        if iteration % 10 == 0:
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
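
read_pickle_file and MysqlMessager belong to the surrounding keyword_app project. Loading the vocabulary only requires a context manager that unpickles a file and yields its contents; a minimal sketch under that assumption:

import pickle
from contextlib import contextmanager

@contextmanager
def read_pickle_file(path):
    # Hypothetical helper: yield the unpickled object and make sure the
    # file handle is closed afterwards.
    f = open(path, 'rb')
    try:
        yield pickle.load(f)
    finally:
        f.close()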
Example #23
def main():
    """
    Analyzes scraped pages using scikit-learn.LDA
    """
    
    # The number of topics
    K = 10
    # no of documents
    D = 300
    n_features = 1000

    # Our vocabulary
    vocab = list(set(file('./vocab').readlines()))
    W = len(vocab)
    
    # Add terms and topics to the DB
    db.init()
    db.add_terms(vocab)
    db.add_topics(K)
    
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)

    # grab documents
    ### Load your scraped pages, re-tokenize, and vectorize result.
    docset, docnames = [], []
    for filename in os.listdir(os.getcwd()):
        if filename.endswith('.html'): 
            tree = html.parse(filename)
            try: encoding = tree.xpath('//meta/@charset')[0]
            except IndexError: encoding = 'utf-8'

            with open(filename) as page:
                rawtext = page.read()
                try: rawtext = rawtext.decode(encoding, errors='backslashreplace')
                except TypeError: continue
                # encoding issues, see http://stackoverflow.com/questions/19527279/python-unicode-to-ascii-conversion
                docset += [clean_html(rawtext)]
                docnames += [filename[:-5]]
                if not(len(docset) % 10): print("loaded " + str(len(docset)) + " documents")

    # Give them to online LDA
    # Also computes an estimate of held-out perplexity
    (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
    (gamma, bound) = olda.update_lambda(wordids, wordcts)

    
    # Arrays for adding batches of data to the DB
    # doc_array = []
    # doc_term_array = []

    # for d in range(len(docnames)):
        # doc_array.append((docnames[d], docset[d]))
    doc_array = zip(docnames, docset)
        
    # Add a batch of docs to the DB; this is the one DB task that is not in
    # the separate DB write thread since later tasks depend on having doc ids.
    # Since writes take so long, this also balances the two threads time-wise.
    doc_ids = db.add_docs(doc_array)

    doc_topic_array = []
    for d in range(len(gamma)):
        doc_size = len(docset[d])
        for k in range(len(gamma[d])):
            doc_topic_array.append((doc_ids[d], k, gamma[d][k], gamma[d][k]/doc_size))
    db.add_doc_topics(doc_topic_array)

    perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
    print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
        (1, olda._rhot, numpy.exp(-perwordbound))

    # Save lambda, the parameters to the variational distributions
    # over topics, and gamma, the parameters to the variational
    # distributions over topic weights for the articles analyzed in
    # the last iteration.
    numpy.savetxt('lambda-%d.dat' % 1, olda._lambda)
    numpy.savetxt('gamma-%d.dat' % 1, gamma)
        
    topic_terms_array = []
    for topic in range(len(olda._lambda)):
        lambda_sum = sum(olda._lambda[topic])
            
        for term in range(len(olda._lambda[topic])):
            topic_terms_array.append((topic, term, olda._lambda[topic][term]/lambda_sum))
    db.update_topic_terms(K, topic_terms_array)
            
    gc.collect() # probably not necessary, but precautionary for long runs
    db.print_task_update()

    # The DB thread ends only when it has both run out of tasks and been
    # signaled that it will not be receiving any more tasks
    db.increment_batch_count()
    db.signal_end()
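
clean_html is not shown; the loop above only needs the visible text of each page to hand to parse_doc_list. One way to get that with lxml, which the example presumably already imports for html.parse, is sketched below (an assumption; the original helper may differ):

import lxml.html

def clean_html(rawtext):
    # Hypothetical: parse the raw (already decoded) HTML and return its
    # visible text content with whitespace collapsed.
    tree = lxml.html.fromstring(rawtext)
    return ' '.join(tree.text_content().split())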
Example #24
def main():
    """
    Analyzes specified documents.
    """
    options = parse_args()
    print (options)

    # we assume there exist three files: 
    # a vocab file (corpus_vocab.dat)
    # a training file (corpus_train.dat)
    # a validation file (corpus_test.dat)

    corpus = options.corpus

    # vocab file
    W = len(open(corpus + "_vocab.dat", 'r').readlines())
    #print(open(corpus + "_vocab.dat", 'r').readlines())
    # validation file
    validation_filename = corpus + "_test.dat"

    wikirandom = archived_dataset.Corpus(corpus + "_train.dat") # should be _train.dat
    # else:
    #     import wikirandom

    #load a held-out set
    validation_docs = archived_dataset.loadDocs(validation_filename)
    algorithmname = options.algorithmname

    # the second tells us the batch size
    batchsize = options.batchsize

    # the third tells us a list of number of threads to run. (each will be run sequentially)
    numthreads = options.numthreads

    # number of documents
    trueD = wikirandom._D
    
  
    if(algorithmname == "hbb"):
        if options.D == -1:
            D = trueD # number of documents to know in advance
        else:
            D = options.D


    # #prior for topics (ANDRE: this is now a parameter)
    # eta = 1.
    eta = options.eta
    
    # The total number of documents
    #D = 3.3e6 (used to be number in Wikipedia; now an argument)

    # The number of topics
    K = options.K
    alpha = 1./K #* numpy.ones(K)
    batchsize = options.batchsize
    
    if (algorithmname == "hdp_filtering"):
        alg = filtering.HDPFiltering(W,eta, options.max_iters,options.threshold*1E-6, T = 300, K = 30)

    if (algorithmname == "ss"):
        if (numthreads == 1):
            alg = filtering.Filtering(W, K, alpha, eta, 1, True, 0.1) # note: last two args shouldn't matter
        else:
            # NOT REALLY SUPPORTED!
            alg = parallelfiltering.ParallelFiltering(W, K, alpha, eta, 1, 0.1, True, options.numthreads)
    if (algorithmname == "filtering"):
        #maxiters = 15
        if (numthreads == 1):
            alg = filtering.Filtering(W, K, alpha, eta, options.max_iters, options.useHBBBound, options.threshold)
        else:
            if (options.async):
                alg = asynchronous.ParallelFiltering(W, K, alpha, eta, options.max_iters, options.threshold, options.useHBBBound, options.batchsize, options.numthreads)
 
                batchsize = batchsize * options.async_batches_per_eval * options.numthreads
            else:
                alg =  parallelfiltering.ParallelFiltering(W, K, alpha, eta, options.max_iters, options.threshold, options.useHBBBound, options.numthreads, options.batchsize)
                batchsize = batchsize * options.numthreads

    if (algorithmname == "hbb"):
        #default: tau0 = 1024; kappa = 0.7
        # paper says: kappa = 0.5; tau0 = 64; S (minibatch size) = 4096
        # alg = onlineldavb.OnlineLDA(W, K, D, alpha, 1./K, options.tau0, options.kappa)  # the original code for NIPS submission, eta = 1/K
        alg = onlineldavb.OnlineLDA(W, K, D, alpha, eta, options.tau0, options.kappa)

    # EP for LDA
    if (algorithmname == "filtering_ep"):
        if (numthreads == 1):
            alg = filtering.FilteringEP(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton)
        else:
            if (options.async):
                alg = asynchronous.ParallelFilteringEP(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton, options.batchsize, options.numthreads)
                batchsize = batchsize * options.async_batches_per_eval * options.numthreads
            else:
                alg = parallelfiltering.ParallelFilteringEP(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton, options.numthreads, options.batchsize)
                batchsize = batchsize * options.numthreads

    # Fake EP for LDA (?) -- to be removed eventually since it's worse than true EP
    if (algorithmname == "filtering_ep2"):
        if (numthreads == 1):
            alg = filtering.FilteringEP2(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton)
        else:
            if (options.async):
                alg = asynchronous.ParallelFilteringEP2(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton, options.batchsize, options.numthreads)
                batchsize = batchsize * options.async_batches_per_eval * options.numthreads
            else:
                alg = parallelfiltering.ParallelFilteringEP2(W, K, alpha, eta, options.max_iters, options.threshold, options.useNewton, options.numthreads, options.batchsize)
                batchsize = batchsize * options.numthreads
    

    # Specify the minimum number of points to be processed before we run the evaluation code, since evaluation is expensive
    minNumPtsPerEval = options.minNumPtsPerEval
    expGrowthEval = options.expGrowthEval
    if (minNumPtsPerEval <= 0):
        if (corpus == "nature"):  # 351K docs
            minNumPtsPerEval = 512 #1e3
        elif (corpus == "wiki"):  # 3.6M docs
            #minNumPtsPerEval = 512 #1e3 #2e4
            minNumPtsPerEval = 2  # for toy wiki dataset
        else:
            minNumPtsPerEval = int(trueD / 1000)

    print ("Using algorithm: " + str(alg))
    recordedData = []
    totalTime = 0.0
    totalDownloadingTime = 0.0
    iters = int(trueD / batchsize) + 1
    #print(iters, batchsize, trueD)
    numPtsProc = 0  # number of points processed since last evaluation
    for iteration in range(iters):
        # Get some articles
        start = time.time()
        docset = wikirandom.get_random_docs(batchsize)
        totalDownloadingTime += time.time() - start
        start = time.time()
        (alg_alpha, alg_lam) = alg.update_lambda(docset)
        iter_time = time.time() - start
        totalTime += iter_time
        numPtsProc += batchsize  # we have processed this many more points
        if (numPtsProc >= minNumPtsPerEval or iteration == iters-1):  # evaluate if we have processed enough points, or this is the last iteration
            numPtsProc = 0  # reset the counter
            # The following is just the usual evaluation code from before
            start = time.time()
            (perplex, split) = evaluation.evaluate(validation_docs, alg_alpha, alg_lam, options.usePtEst)
            testTime = time.time() - start
            print (str(iteration+1) + "/" + str(iters) + " " + str(alg) + " (%g, %g): held-out perplexity estimate = %f, %f" % (iter_time, testTime, perplex, split))
            recordedData += [((iteration+1)*batchsize, totalTime, totalDownloadingTime, perplex, split)]  # also save perplexity now!
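            # each recordedData row is (documents seen, cumulative training time,
            # cumulative downloading time, perplexity, second value returned by evaluation.evaluate)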
            if (algorithmname in ["hbb", "filtering", "filtering_ep", "filtering_ep2"]):
    	        outfile = corpus + "_" + str(alg) + "_" + str(batchsize) + "_eta" + str(eta)  # need to distinguish eta now
            else:
    	        outfile = corpus + "_" + algorithmname + "_" + str(options.batchsize) + "_" + str(options.numthreads) + "_eta" + str(eta)
            numpy.save(outfile, recordedData)

            if (expGrowthEval):
                # double the number of points required before the next evaluation
                minNumPtsPerEval = minNumPtsPerEval * 2
        else:
            print (str(iteration+1) + "/" + str(iters) + " " + str(alg) + " (%g)" % (iter_time))

        if (iteration == iters-1):
            # save final lambda matrix
            if (algorithmname in ["hbb", "filtering", "filtering_ep", "filtering_ep2"]):
                topics_outfile = "topics_" + corpus + "_" + str(alg) + "_" + str(batchsize) + "_eta" + str(eta)  # need to distinguish eta now
            else:
                topics_outfile = "topics_" + corpus + "_" + algorithmname + "_" + str(options.batchsize) + "_" + str(options.numthreads)
            numpy.save(topics_outfile, alg_lam)

    # asynchronous filtering needs to terminate its workers
    if (algorithmname == "filtering"):
        if (numthreads > 1):
            if (options.async):
                alg.shutdown()

    print ("DONE!")
Beispiel #25
0
def fit_olda(parse, doc_path, doc_file, vocab_file, outdir, K, batch_size, \
    iterations, verbose_topics, anchors, tmv_pickle, lemmatize, final_pass, \
    full_doc_topics):
    """
    Analyzes a set of documents using online VB for LDA.
    """
    # instance used to generate random documents
    if parse == "live":  # read and parse docs on the fly using vocab
        docgen = generalrandom.LiveparseDocGen(doc_path)
    else:  # alternative: preparsed
        docgen = generalrandom.PreparseDocGen(doc_file)

    # The total number of documents in the corpus
    D = docgen.getDocCount()
    if iterations == 0:
        iterations = max(D / batch_size, 10)

    # Our vocabulary
    if parse == "live" or verbose_topics:
        vocab = [term.strip() for term in file(vocab_file).readlines()]
        W = len(vocab)
    else:
        W = docgen.getTermCount()
        vocab = ["term " + str(w) for w in range(W)]

    # write out general settings to pickle file for use by TMV later
    if tmv_pickle:
        # save model settings: vocab, K, docgen
        f = open(join(outdir, 'settings.pickle'), 'w+')
        cPickle.dump((vocab, K, docgen, lemmatize), f)
        f.close()

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7, anchors, \
        lem = lemmatize, preparsed = (parse == "preparsed"))
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)

    iteration = 0
    old_perplexity = 1.0 * sys.maxint
    delta_perplexity = 1.0 * sys.maxint
    delta_perplexities = [old_perplexity] * 10
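    # rolling window of the last 10 relative changes in minibatch perplexity;
    # the loop below stops once their mean falls below 0.1%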
    logfile = open(join(outdir, 'log.out'), 'w+')

    while iteration < iterations and sum(
            delta_perplexities
    ) / 10 > 0.001:  # 0.1% change in sample perplexity

        iter_start = time.time()

        # Grab a random batch of documents
        docset = docgen.get_random_articles(batch_size)

        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)

        # Compute an estimate of held-out perplexity
        if parse == "live":
            (wordids, wordcts) = onlineldavb.parse_doc_list(docset, \
                olda._vocab, lemmatize)
        else:
            (wordids, wordcts) = docset

        # estimate perplexity with the current batch
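        # (the bound returned by update_lambda approximates the ELBO over all D
        # documents, so it is rescaled to a per-word bound on this batch before
        # being exponentiated into a perplexity)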
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        perplexity = numpy.exp(-perwordbound)
        delta_perplexity = abs(old_perplexity - perplexity) / perplexity
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f (%.2f%%)' % \
            (iteration, olda._rhot, perplexity, delta_perplexity * 100)
        logfile.write(
            '%d:  rho_t = %f,  held-out perplexity estimate = %f (%.2f%%)\n' %
            (iteration, olda._rhot, perplexity, delta_perplexity * 100))
        old_perplexity = perplexity
        delta_perplexities.pop(0)
        delta_perplexities.append(delta_perplexity)

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt(join(outdir, 'lambda-%d.dat' % iteration), \
                olda._lambda)
            numpy.savetxt(join(outdir, 'gamma-%d.dat' % iteration), gamma)

            if verbose_topics:
                print_topics(K, 7, vocab, olda._lambda, anchors)

        iteration += 1

    logfile.close()

    if tmv_pickle:
        f = open(join(outdir, 'olda.pickle'), 'w+')
        cPickle.dump(olda, f)
        f.close()

    # save final iters
    numpy.savetxt(join(outdir, 'lambda-final.dat'), olda._lambda)
    numpy.savetxt(join(outdir, 'gamma-%d.dat' % iteration), gamma)

    # do a final pass on all documents
    if (final_pass):
        fout = open(join(outdir, "gamma-final.dat"), 'w+')
        if not full_doc_topics:
            fout.write("doc.lda.id\ttopic.id\tscore\n")

        i = 0
        for doc in docgen:
            if parse == 'live':  #TODO: the parsers should return same order...
                doc = doc[1]
            (gamma, ss) = olda.do_e_step(doc)
            j = 0
            if not full_doc_topics:
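                # write a sparse doc/topic/score table, keeping only topics whose
                # gamma weight exceeds a small threshold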
                for g in gamma.tolist()[0]:
                    if g > 0.051:
                        fout.write("%d\t%d\t%f\n" % (i, j, g))
                    j += 1
                i += 1
            else:
                gf = gamma.tolist()[0]
                fout.write(('\t'.join(["%f"] * len(gf)) + '\n') % tuple(gf))
        fout.close()
def main():
    """
    Test function for Online LDA using Variational Bayes
    """

    # The number of documents to analyze each iteration
    batchsize = 4

    # The total number of documents (or an estimate of all docs)
    D = 16

    # The number of topics
    K = 3

    # How many documents to look at
    if (len(sys.argv) < 2):
        num_iters = int(D / batchsize)
    else:
        num_iters = int(sys.argv[1])

    # Our vocabulary
    #vocab = file('./dictnostops.txt').readlines()
    #W = len(vocab)

    print "num_iters: %s " % num_iters

    QSR_vectors = cPickle.load(open("Data/feature_space.p", "rb"))
    QSR_codebook = cPickle.load(open("Data/code_book.p", "rb"))
    codebook_len = len(QSR_codebook)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1, kappa=0.7
    olda = onlineldavb.OnlineLDA(QSR_codebook, K, D, 1. / K, 1. / K, 1., 0.7)

    # Run until we've seen D documents.
    for iteration in range(0, num_iters):
        print "it: %s. start: %s. end: %s" % (iteration, iteration * batchsize,
                                              (iteration + 1) * batchsize)
        # Select the next batch of QSR feature vectors
        # (the original Wikipedia download is left commented out for reference)
        #(docset, articlenames) = wikirandom.get_random_wikipedia_articles(batchsize)
        docset = QSR_vectors[iteration * batchsize:(iteration + 1) * batchsize]

        print "size of docset: %s" % len(docset)

        wordids = []
        wordcts = []
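        # convert each dense QSR count vector into the sparse parallel lists of
        # non-zero feature ids and counts passed to update_lambda below; a
        # standalone sketch of the same conversion follows after this function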

        for cnt, v in enumerate(docset):
            print "\n cnt: ", cnt

            nonzeros = numpy.nonzero(v)
            available_features = nonzeros

            wordids.append(available_features)
            feature_counts = v[nonzeros]
            wordcts.append(feature_counts)

            print "v ", v
            print "avail features %s, feature_cnts: %s" % (available_features,
                                                           feature_counts)

        print "wordids %s, wordcts: %s" % (wordids, wordcts)

        (gamma, bound) = olda.update_lambda(wordids, wordcts)
        # Compute an estimate of held-out perplexity

        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 1 == 0):
            numpy.savetxt('Data/lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('Data/gamma-%d.dat' % iteration, gamma)
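
# A minimal standalone sketch (hypothetical helper, not part of the original
# script) of the dense-to-sparse conversion done in the loop above, assuming
# v is a 1-D numpy array of counts:
def _dense_to_sparse(v):
    import numpy
    ids = numpy.nonzero(v)[0]   # indices of the non-zero features
    return ids, v[ids]          # and their counts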
Beispiel #27
0
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100
    rho_t_vector = []
    perplexity_vector = []
    time_vector = []
    time1_vector = []
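    # time_vector: wall-clock time since the start of training;
    # time1_vector: cumulative time spent in the perplexity-evaluation block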

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D / batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7

    kappa = 0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., kappa)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    t1 = time.time()
    for iteration in tqdm(range(0, documentstoanalyze)):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda_docs(docset)
        # Compute an estimate of held-out perplexity
        t = time.time()
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))
        t2 = time.time()
        time_vector.append(t2 - t1)
        if len(time1_vector) == 0:
            time1_vector.append(t2 - t)
        else:
            time1_vector.append(time1_vector[-1] + t2 - t)
        rho_t_vector.append(olda._rhot)
        perplexity_vector.append(perwordbound)
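        # note: this stores the per-word bound itself; the corresponding
        # perplexity is numpy.exp(-perwordbound)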

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)

        numpy.savetxt('time_%.1f_%d' % (kappa, batchsize),
                      numpy.array(time_vector))
        numpy.savetxt('rho_%.1f_%d' % (kappa, batchsize),
                      numpy.array(rho_t_vector))
        numpy.savetxt('perplexity_%.1f_%d' % (kappa, batchsize),
                      numpy.array(perplexity_vector))
        numpy.savetxt('time1_%.1f_%d' % (kappa, batchsize),
                      numpy.array(time1_vector))
Beispiel #28
0
def main(argv):

    doc_list = []

    argList = handleArgs(argv)
    #list the docs in pickledDocs folder
    p = "../data/pickledDocs/"
    l = listdir(p)
    fileList = [p + f for f in l]

    #for each pickled doclist, append all docs to master doclist
    for fi in fileList:
        with open(fi, 'rb') as d:
            docs = cPickle.load(d)
        for k, x in docs.iteritems():
            doc_list.append(x)
        print len(doc_list)

    #D is total number of docs to show to the model, K is number of topics
    goal_its = 80  #number of iterations to run LDA
    corp_size = len(doc_list)  #number of documents in the corpus
    D = corp_size * goal_its  #number of documents expected to see
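    # (the whole corpus is passed as a single batch on every iteration, so the
    # number of documents "expected to see" counts each pass over the corpus)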
    K = 10  #default topic value, if none given in parameters
    saveModel = False  #whether to save LDA model itself
    desc = ""  #for performing non-standard runs
    version = ""  #for having multiple models with same parameters
    hyper_param = ""  #for testing hyperparameters

    #define the vocabulary file we will be using
    vocab = helper_funcs.read_dict("../data/dictionary.txt")  #default dict

    #initialize an instance of the OnlineLDA algorithm
    #parameters - vocabulary, num topics K, total docs D, alpha, eta ("beta"), tau_0, kappa
    #if the path to an OnlineLDA pickle is passed, it re-opens that pickle

    K = int(argList[0])
    vocab = str.split(file(argList[1]).read())
    if argList[2] is not None:
        alpha = argList[2]
    else:
        alpha = 0.1
    if argList[3] is not None:
        beta = argList[3]
    else:
        beta = 1.

    saveModel = False
    lda = onlineldavb.OnlineLDA(vocab, K, D, alpha, beta, 1024, 0.)
    print "created LDA with parameters:\nnumwords: " + str(
        len(vocab)) + "\n#topics: " + str(K) + "\nalpha: " + str(
            alpha) + "\nbeta: " + str(beta)

    paramTitle = hyper_param + str(
        len(vocab) / 1000) + "kwords_" + str(K) + "topics"

    folder = "../data/out/models/" + paramTitle
    if not isdir(folder):
        mkdir(folder)

    W = len(vocab)

    print "dictionary size: " + str(W)
    print paramTitle

    print folder
    #if desc.find("label") > -1:
    #    with open("../data/out/past_models/"+paramTitle+"/dictionary.txt",'wb') as f:
    #        voc = sorted(vocab.items(),key=operator.itemgetter(1))
    #        for x in voc:
    #            f.write(x[0]+"\n")
    #perform LDA on the document list for goal_its iterations, updating lambda
    for i in range(lda._updatect, goal_its):
        print doc_list
        print i
        (gamma, bound) = lda.update_lambda(doc_list)

        (wordids, wordcts) = onlineldavb.parse_doc_list(doc_list, lda._vocab)
        perwordbound = bound * len(doc_list) / (D * sum(map(sum, wordcts)))
        print np.exp(-perwordbound)

        #pickle the model and its output occasionally
        if (i + 1) == goal_its:
            if not isdir(folder):
                mkdir(folder)
            with open(folder + "/gamma.pickle", 'wb') as f:
                cp2 = cPickle.Pickler(f)
                cp2.dump(gamma)
            with open(folder + "/lambda.pickle", 'wb') as f:
                cp = cPickle.Pickler(f)
                cp.dump(lda._lambda)
            np.savetxt(folder + '/lambda.dat', lda._lambda)

            if saveModel:

                with open(folder + "/LDA.pickle", 'wb') as f:
                    cp3 = cPickle.Pickler(f)
                    cp3.dump(lda)
Beispiel #29
0
def main():
    """
    Downloads and analyzes a bunch of random Wikipedia articles using
    online VB for LDA.
    """

    # The number of documents to analyze each iteration
    batchsize = 64
    # The total number of documents in Wikipedia
    D = 3.3e6
    # The number of topics
    K = 100

    # How many documents to look at
    if (len(sys.argv) < 2):
        documentstoanalyze = int(D/batchsize)
    else:
        documentstoanalyze = int(sys.argv[1])

    # Our vocabulary
    vocab = file('./dictnostops.txt').readlines()
    W = len(vocab)
    
    # Add terms and topics to the DB
    db.init()
    db.add_terms(vocab)
    db.add_topics(K)
    
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    olda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, documentstoanalyze):
        # Download some articles
        (docset, articlenames) = \
            wikirandom.get_random_wikipedia_articles(batchsize)
        
        # Give them to online LDA
        (gamma, bound) = olda.update_lambda(docset)
        
        # Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, olda._vocab)
        
        # Arrays for adding batches of data to the DB
        doc_array = []
        doc_term_array = []
        
        for d in range(len(articlenames)):
            doc_array.append((articlenames[d], docset[d]))
        
        # Add a batch of docs to the DB; this is the one DB task that is not in
        # the separate DB write thread since later tasks depend on having doc ids.
        # Since writes take so long, this also balances the two threads time-wise.
        doc_ids = db.add_docs(doc_array)
	
        doc_topic_array = []
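        # store each (doc, topic) pair with both the raw gamma weight and the
        # weight normalized by the length of the raw article text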
        for d in range(len(gamma)):
            doc_size = len(docset[d])
            for k in range(len(gamma[d])):
                doc_topic_array.append((doc_ids[d], k, gamma[d][k], gamma[d][k]/doc_size))
        db.add_doc_topics(doc_topic_array)

        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))
        print '%d:  rho_t = %f,  held-out perplexity estimate = %f' % \
            (iteration, olda._rhot, numpy.exp(-perwordbound))

        # Save lambda, the parameters to the variational distributions
        # over topics, and gamma, the parameters to the variational
        # distributions over topic weights for the articles analyzed in
        # the last iteration.
        if (iteration % 10 == 0):
            numpy.savetxt('lambda-%d.dat' % iteration, olda._lambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)
            
            topic_terms_array =[]
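            # normalize each row of lambda so the stored topic-term scores sum
            # to one (the mean of the variational Dirichlet for that topic)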
            for topic in range(len(olda._lambda)):
                lambda_sum = sum(olda._lambda[topic])
                
                for term in range(len(olda._lambda[topic])):
                    topic_terms_array.append((topic, term, olda._lambda[topic][term]/lambda_sum))
            db.update_topic_terms(K, topic_terms_array)
                
            gc.collect() # probably not necessary, but precautionary for long runs
            db.print_task_update()
        db.increment_batch_count()
    
    # The DB thread ends only when it has both run out of tasks and it has been
    # signaled that it will not be receiving any more tasks
    db.signal_end()
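
# A minimal standalone sketch (hypothetical, not part of the script above) of
# turning a fitted lambda matrix into readable topics: normalize each row and
# list the highest-weight vocabulary terms.
def _top_terms(lam, vocab, n=10):
    import numpy
    topics = []
    for k in range(lam.shape[0]):
        probs = lam[k] / lam[k].sum()          # mean topic-term distribution
        top = numpy.argsort(probs)[::-1][:n]   # indices of the n largest weights
        topics.append([vocab[w] for w in top])
    return topics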
Beispiel #30
0
    #Our vocabulary
    vocab = file(sys.argv[1] + '.dict').readlines()
    vocab = [item.rstrip() for item in vocab]
    W = len(vocab)

    #Read in Abstracts
    pages = file(sys.argv[1] + '.corpus').readlines()
    pages = [strs.rstrip() for strs in pages]
    D = len(pages)

    pageID = range(0, D)
    nBatches = D / batchsize

    #Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=128, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 128., 0.7)

    #Run
    nBatches = 100
    for iteration in range(0, nBatches):

        #Grab Abstracts
        (docset, pagenames, pages,
         pageID) = grabAbstracts(pages, batchsize, pageID)

        #Give them to online LDA
        (gamma, bound) = lda.update_lambda(docset)

        #Compute an estimate of held-out perplexity
        (wordids, wordcts) = onlineldavb.parse_doc_list(docset, lda._vocab)
        perwordbound = bound * len(docset) / (D * sum(map(sum, wordcts)))