def main():
    """CLI entry point: build a corpus/vocabulary from -f or -c and train an LDA model.

    Relies on module-level names `numpy`, `LDA` and `lda_learning` defined
    elsewhere in this file.
    """
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")
    # PEP 8 idiom fix: comparisons to None use `is` / `is not`.
    if options.seed is not None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        # Drop words that occur in fewer than `df` documents.
        docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)

    # To profile: cProfile.runctx('lda_learning(lda, options.iteration, voca)',
    #                             globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)
def main():
    """Hold out every 10th word of each document and benchmark LDA_CVB0 vs LDA.

    Uses module-level helpers `conv_word_freq`, `FileOutput` and `lda_learning`.
    """
    import optparse
    import vocabulary
    import lda
    import lda_cvb0

    cli = optparse.OptionParser()
    cli.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)", default="0:100")
    cli.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    cli.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    cli.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    cli.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    cli.add_option("--seed", dest="seed", type="int", help="random seed")
    cli.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    cli.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=10)
    options, _unused_args = cli.parse_args()

    corpus = vocabulary.load_corpus(options.corpus)
    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    def tenth_split(doc, keep_tenth):
        # Positions divisible by 10 form the held-out set; the rest train.
        return [w for pos, w in enumerate(doc) if (pos % 10 == 0) == keep_tenth]

    train_part = [tenth_split(doc, False) for doc in docs]
    test_part = [tenth_split(doc, True) for doc in docs]
    test_wordfreq = conv_word_freq(test_part)

    report = FileOutput("lda_test2")
    report.out("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(docs), len(voca.vocas), options.K, options.alpha, options.beta))

    # Same four runs as before: CVB0 then collapsed Gibbs, plain then smart init.
    for smart in (False, True):
        lda_learning(report, lda_cvb0.LDA_CVB0, smart, options, train_part, test_wordfreq, voca)
    for smart in (False, True):
        lda_learning(report, lda.LDA, smart, options, train_part, test_part, voca, 2)
def getVocabulary(): import vocabulary import random data_path = "/media/University/UniversityDisc/2-Master/MasterThesis/EjecucionTesis/Desarrollo/PythonProjects/Data/" filename = data_path + "aolGoals.txt" stopwords=True df=0 #(options, args) = parser.parse_args() if not (filename or corpus): "need corpus filename(-l) or corpus range(-l)" if filename: corpus = vocabulary.load_file(filename) else: corpus = vocabulary.load_corpus(corpus) if not corpus: "corpus range(-l) forms 'start:end'" cp = list(corpus) random.seed(326719) random.shuffle(cp) p = int(len(cp) * .7) cp_train = cp[0:p] cp_test = cp[p:] print "Corpus to Test:", len(cp_test) print "Corpus to Train:", len(cp_train) voca = vocabulary.Vocabulary(stopwords) docs = [voca.doc_to_ids(doc) for doc in cp_train] if df > 0: docs = voca.cut_low_freq(docs, df) return voca, docs, cp_train
def main(): import optparse import vocabulary parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") if options.seed != None: numpy.random.seed(options.seed) voca = vocabulary.Vocabulary(options.stopwords) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit) print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta) #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') lda_learning(lda, options.iteration, voca)
def main():
    """Train LDA_CVB0 and collapsed-Gibbs LDA on a Brown-corpus slice and log results.

    Uses module-level helpers `FileOutput` and `lda_learning`.
    """
    import optparse
    import vocabulary
    import lda
    import lda_cvb0

    cli = optparse.OptionParser()
    cli.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)", default="1:100")
    cli.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    cli.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    cli.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    cli.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    cli.add_option("--seed", dest="seed", type="int", help="random seed")
    cli.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    cli.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=1)
    options, _unused_args = cli.parse_args()

    corpus = vocabulary.load_corpus(options.corpus)
    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    report = FileOutput("lda_test")
    report.out("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(docs), len(voca.vocas), options.K, options.alpha, options.beta))

    # Same four runs: CVB0 then collapsed Gibbs, each without and with smart init.
    for smart in (False, True):
        lda_learning(report, lda_cvb0.LDA_CVB0, smart, options, docs, voca)
    for smart in (False, True):
        lda_learning(report, lda.LDA, smart, options, docs, voca, 2)
def main(): import optparse parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=numpy.random.gamma(1, 1)) parser.add_option("--gamma", dest="gamma", type="float", help="parameter gamma", default=numpy.random.gamma(1, 1)) parser.add_option("--base", dest="base", type="float", help="parameter of base measure H", default=0.5) parser.add_option("-k", dest="K", type="int", help="initial number of topics", default=1) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("-s", dest="stopwords", type="int", help="0=exclude stop words, 1=include stop words", default=1) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.seed != None: numpy.random.seed(options.seed) import vocabulary if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") voca = vocabulary.Vocabulary(options.stopwords == 0) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) hdplda = HDPLDA(options.K, options.alpha, options.gamma, options.base, docs, voca.size()) print "corpus=%d words=%d alpha=%f gamma=%f base=%f initK=%d stopwords=%d" % ( len(corpus), len(voca.vocas), options.alpha, options.gamma, options.base, options.K, options.stopwords, ) # hdplda.dump() # import cProfile # cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), 
locals(), 'hdplda.profile') hdplda_learning(hdplda, options.iteration) phi = hdplda.worddist() for k, phi_k in enumerate(phi): print "\n-- topic: %d" % k for w in numpy.argsort(-phi_k)[:20]: print "%s: %f" % (voca[w], phi_k[w])
def main():
    """CLI entry point (Python 3 variant): build a corpus/vocabulary and train LDA.

    Relies on module-level `numpy`, `LDA` and `lda_learning`.
    """
    import optparse
    import vocabulary  # vocabulary/corpus helper package
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    # FIX: removed a stray `help(vocabulary)` debugging call and dead
    # commented-out debug code that dumped the options dict.
    if not (options.filename or options.corpus):  # both default to None
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")
    # PEP 8 idiom fix: comparisons to None use `is` / `is not`.
    if options.seed is not None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)
    # FIX: the status line's format string was corrupted by a pasted blog URL.
    print("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (
        len(corpus), len(voca.vocas), options.K, options.alpha, options.beta))

    # To profile: cProfile.runctx('lda_learning(lda, options.iteration, voca)',
    #                             globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)
def main(): import optparse parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=numpy.random.gamma(1, 1)) parser.add_option("--gamma", dest="gamma", type="float", help="parameter gamma", default=numpy.random.gamma(1, 1)) parser.add_option("--base", dest="base", type="float", help="parameter of base measure H", default=0.5) parser.add_option("-k", dest="K", type="int", help="initial number of topics", default=1) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("-s", dest="stopwords", type="int", help="0=exclude stop words, 1=include stop words", default=1) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.seed != None: numpy.random.seed(options.seed) import vocabulary if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") voca = vocabulary.Vocabulary(options.stopwords==0) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) hdplda = HDPLDA(options.K, options.alpha, options.gamma, options.base, docs, voca.size()) print "corpus=%d words=%d alpha=%f gamma=%f base=%f initK=%d stopwords=%d" % (len(corpus), len(voca.vocas), options.alpha, options.gamma, options.base, options.K, options.stopwords) #hdplda.dump() #import cProfile #cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), locals(), 
'hdplda.profile') hdplda_learning(hdplda, options.iteration) phi = hdplda.worddist() for k, phi_k in enumerate(phi): print "\n-- topic: %d" % k for w in numpy.argsort(-phi_k)[:20]: print "%s: %f" % (voca[w], phi_k[w])
def main(): import optparse parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=numpy.random.gamma(1, 1)) parser.add_option("--gamma", dest="gamma", type="float", help="parameter gamma", default=numpy.random.gamma(1, 1)) parser.add_option("--beta", dest="beta", type="float", help="parameter of beta measure H", default=0.5) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("-s", dest="stopwords", type="int", help="0=exclude stop words, 1=include stop words", default=1) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.seed != None: numpy.random.seed(options.seed) import vocabulary if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") voca = vocabulary.Vocabulary(options.stopwords==0) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) hdplda = HDPLDA(options.alpha, options.gamma, options.beta, docs, voca.size()) print "corpus=%d words=%d alpha=%.3f gamma=%.3f beta=%.3f stopwords=%d" % (len(corpus), len(voca.vocas), options.alpha, options.gamma, options.beta, options.stopwords) #hdplda.dump() #import cProfile #cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), locals(), 'hdplda.profile') hdplda_learning(hdplda, options.iteration) output_summary(hdplda, voca)
def main(): import optparse import vocabulary parser = optparse.OptionParser() parser.add_option("--newsf", dest="newsfile", help="news corpus filename") parser.add_option("--tweetsf", dest="tweetsfile", help="tweets corpus filename") parser.add_option("-a", dest="authorfile", help="author filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() if not (options.newsfile or options.corpus): parser.error("need corpus news file(--newsf) or corpus range(-c)") if not options.tweetsfile: parser.error("need corpus tweets file(--tweetsf)") if not options.authorfile: parser.error("need author file(-a)") if options.newsfile: news_corpus = vocabulary.load_file(options.newsfile) news_len = len(news_corpus) print "Load News data from '" + options.newsfile + "'" print "\t", news_len, "News in total" else: news_corpus = vocabulary.load_corpus(options.corpus) if not news_corpus: parser.error("corpus range(-c) forms 'start:end'") if options.seed != None: np.random.seed(options.seed) voca = vocabulary.Vocabulary(options.stopwords) print "Load Twitters data from '" + options.tweetsfile + "'" ori_twitter_corpus = 
vocabulary.load_file(options.tweetsfile, 'utf-8') print "Initialize the authors set" num_authors, author_set = vocabulary.load_author(options.authorfile) print "\t", num_authors, "authors in total" # Remove words less frequent twitter_dict = {} for line in ori_twitter_corpus: for w in line: if w in twitter_dict: twitter_dict[w] += 1 else: twitter_dict[w] = 1 twitter_corpus = [] for line in ori_twitter_corpus: for w in line: if twitter_dict[w] < 2: line.remove(w) twitter_corpus.append(line) twitter_corpus = twitter_corpus[:len(author_set)] twitter_len = len(ori_twitter_corpus) print "\t", twitter_len, "Tweets in total" # Whole collection corpus = news_corpus + twitter_corpus # voca = vocabulary.Vocabulary(options.stopwords) docs = [voca.doc_to_ids(doc) for doc in (corpus)] # docs is the documents list [[1,2,3],[4,2,3...]] twitter_words_set = set([w for doc in (twitter_corpus) for w in voca.doc_to_ids(doc)]) # is the Twitter list news_words_set = set([w for doc in (news_corpus) for w in voca.doc_to_ids(doc)]) # is the News list print "Number for Twitter words:", len(twitter_words_set) print "Number of News words:", len(news_words_set) if options.df > 0: docs = voca.cut_low_freq(docs, options.df) corpus_collection = list(set([w for doc in docs for w in doc])) # Initialization print "Initialize the heterogenous topic model" htm = HTM(options.K, options.alpha, options.beta, docs, news_len, num_authors, author_set, voca, twitter_words_set, news_words_set) # Get the results news_wt_distribution, tweets_wt_distribution, htm_wt_distribution, tweets_at_distribution, news_dt_distribution = htm.gibbs_sampling(options.iteration) print "KL from news to htm" KL_divergence(news_wt_distribution, htm_wt_distribution) print "KL from tweets to htm" KL_divergence(tweets_wt_distribution, htm_wt_distribution) print "KL from news to tweets" KL_divergence(news_wt_distribution, tweets_wt_distribution) print "KL from tweets to news" KL_divergence(tweets_wt_distribution, news_wt_distribution) 
htm.print_top_words(20, news_wt_distribution, voca.vocas) ''' Perplexity ''' perplexity = 0 N = 0 for line in htm_wt_distribution: for v in line: perplexity += np.log(v) N += len(line) print "Perplexity", np.exp(float(-perplexity)/N) htm.print_entropy() f = open(model + "news_wt.txt", "a") for line in news_wt_distribution: for n in line: f.write(str(n) + " ") f.write("\n") f.close() f = open(model + "tweets_wt.txt", "a") for line in tweets_wt_distribution: for n in line: f.write(str(n) + " ") f.write("\n") f.close() f = open(model + "htm_wt.txt", "a") for line in htm_wt_distribution: for n in line: f.write(str(n) + " ") f.write("\n") f.close() f = open(model + "tweets_at.txt", "a") for line in tweets_at_distribution: for n in line: f.write(str(n) + " ") f.write("\n") f.close() f = open(model + "news_dt.txt", "a") for line in news_dt_distribution: for n in line: f.write(str(n) + " ") f.write("\n") f.close()
def main(): import os import pickle import optparse parser = optparse.OptionParser() parser.add_option("-m", dest="model", help="model filename") parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-b", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.01) parser.add_option("--eta", dest="eta", type="float", help="parameter eta", default=100) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) parser.add_option( "-c", dest="constraint", help="add constraint (wordlist which should belong to the same topic)") parser.add_option("-u", "--unassign", dest="unassign", help="unassign method (all/doc/term/none)", default="none") (options, args) = parser.parse_args() numpy.random.seed(options.seed) if options.model and os.path.exists(options.model): with open(options.model, "rb") as f: lda, voca = pickle.load(f) elif not (options.filename or options.corpus): parser.error( "need corpus filename(-f) or corpus range(-b) or model(-m)") else: import vocabulary if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") voca = vocabulary.Vocabulary() docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) lda = ITM(options.K, options.alpha, options.beta, options.eta, docs, voca.size()) param = (len(lda.docs), len(voca.vocas), options.K, options.alpha, options.beta, 
options.eta) print "corpus=%d, words=%d, K=%d, a=%f, b=%f, eta=%f" % param if options.constraint: if options.unassign == "all": add_constraint = lda.add_constraint_all elif options.unassign == "doc": add_constraint = lda.add_constraint_doc elif options.unassign == "term": add_constraint = lda.add_constraint_term elif options.unassign == "none": add_constraint = lda.add_constraint_none else: parser.error("unassign method(-u) must be all/doc/term/none") wordlist = options.constraint.split(',') idlist = [voca.vocas_id[w] for w in wordlist] print "\n== add constraint ==" for w, v in zip(idlist, wordlist): print "%s [%s]" % (v, ",".join(str(x) for x in lda.n_k_w[:, w])) add_constraint(idlist) lda.verify_topic() #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') lda_learning(lda, options.iteration, voca) with open(options.model, "wb") as f: pickle.dump((lda, voca), f)
def main():
    """CLI driver for HDP-LDA (Python 3 variant) with a default corpus file.

    Relies on module-level `numpy`, `HDPLDA`, `hdplda_learning`, `output_summary`.
    NOTE(review): alpha/gamma defaults are drawn from numpy.random.gamma at
    option-definition time, so each run gets fresh random defaults.
    """
    import optparse
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename", default="1000_p.txt")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=numpy.random.gamma(1, 1))
    parser.add_option("--gamma", dest="gamma", type="float", help="parameter gamma", default=numpy.random.gamma(1, 1))
    parser.add_option("--beta", dest="beta", type="float", help="parameter of beta measure H", default=0.5)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10)
    parser.add_option("-s", dest="stopwords", type="int", help="0=exclude stop words, 1=include stop words", default=1)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")
    # PEP 8 idiom fix: comparisons to None use `is` / `is not`.
    if options.seed is not None:
        numpy.random.seed(options.seed)

    import vocabulary
    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")

    # -s 0 means "exclude stop words", so Vocabulary gets the exclusion flag.
    voca = vocabulary.Vocabulary(options.stopwords == 0)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    hdplda = HDPLDA(options.alpha, options.gamma, options.beta, docs, voca.size())
    print("corpus=%d words=%d alpha=%.3f gamma=%.3f beta=%.3f stopwords=%d" % (
        len(corpus), len(voca.vocas), options.alpha, options.gamma, options.beta, options.stopwords))
    # To profile: cProfile.runctx('hdplda_learning(hdplda, options.iteration)',
    #                             globals(), locals(), 'hdplda.profile')
    hdplda_learning(hdplda, options.iteration)
    output_summary(hdplda, voca)
def main():
    """CLI driver for the structural topic model (STM): parse a CSV corpus,
    build prevalence (X) and covariate (Y) design matrices, then run
    LDA initialization followed by STM learning.

    Relies on module-level `pd`, `np` and `stm`.
    """
    import argparse
    import vocabulary
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", dest="filename", help="Set corpus filepath. Fileformat is csv")
    parser.add_argument("-d", dest="document", help="Set document field name")
    parser.add_argument("-c", dest="corpus", help="Using range of Brown corpus' files(start:end)")
    parser.add_argument("--alpha", dest="alpha", type=float, help="Parameter alpha for LDA(default=1.0)", default=1.0)
    parser.add_argument("--beta", dest="beta", type=float, help="Parameter beta for LDA(default=0.1)", default=0.1)
    parser.add_argument("-k", dest="topics", type=int, help="Number of topics(default=20)", default=20)
    parser.add_argument("-i", dest="iteration", type=int, help="Iteration count(default=100)", default=100)
    parser.add_argument("-x", dest="X", type=str, help="Set prevalences column name", default=None)
    parser.add_argument("-y", dest="Y", type=str, help="Set covariates column name", default=None)
    parser.add_argument("--parser", dest="parser", help="Select parser eng_nltk or mecab(default=mecab)", default="mecab")
    # FIX: --sigma lacked type=float, so a user-supplied value arrived as a string.
    parser.add_argument("--sigma", dest="sigma", type=float, help="Initial value of sigma diagonals(default=0.1)", default=0.1)
    parser.add_argument("--stopwords", dest="stopwords", help="Exclude stop words by using corpus from nltk", action="store_true", default=False)
    parser.add_argument("--seed", dest="seed", type=int, help="Random seed")
    parser.add_argument("--df", dest="df", type=int, help="Threshold of document freaquency to cut words", default=0)
    parser.add_argument("--interact", dest="interact", action="store_true", help="Consider interaction between covariates and topics", default=False)
    parser.add_argument("--sinit", dest="smartinit", action="store_true", help="Smart initialize of parameters for LDA", default=False)
    options = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        load_doc = pd.read_csv(options.filename)
        if options.parser.lower() == "eng_nltk":
            corpus = vocabulary.load_dataframe(load_doc[options.document])
        elif options.parser.lower() == "mecab":
            corpus = vocabulary.load_dataframe_jp(load_doc[options.document])
        else:
            # FIX: an unknown parser previously left `corpus` undefined (NameError).
            parser.error("parser(--parser) must be eng_nltk or mecab")
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")
    if options.seed is not None:
        np.random.seed(options.seed)

    print("proc voca")
    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]

    # Build the prevalence design matrix X (with intercept column), if requested.
    print("proc X")
    if options.X is not None:
        X = pd.get_dummies(load_doc[options.X.split(',')], drop_first=True).values
        X = np.concatenate((np.ones(X.shape[0])[:, np.newaxis], X), axis=1)
    else:
        X = options.X
    # Build the covariate vector Y, if requested.
    print("proc Y")
    if options.Y is not None:
        Y = pd.get_dummies(load_doc[[options.Y]], drop_first=True).values.flatten()
    else:
        Y = options.Y

    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    print("set STM obj")
    stm_obj = stm.STM_factory_method(options.topics, X, Y, docs, voca.size(),
                                     options.sigma, options.interact)
    print("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(
        voca.vocas), options.topics, options.alpha, options.beta))
    print("lda_initialize")
    stm_obj.lda_initialize(options.alpha, options.beta, 10, voca, options.smartinit)
    print("stm_learning")
    stm_obj.learning(options.iteration, voca)
def main(): import os import pickle import optparse parser = optparse.OptionParser() parser.add_option("-m", dest="model", help="model filename") parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-b", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.01) parser.add_option("--eta", dest="eta", type="float", help="parameter eta", default=100) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) parser.add_option("-c", dest="constraint", help="add constraint (wordlist which should belong to the same topic)") parser.add_option("-u", "--unassign", dest="unassign", help="unassign method (all/doc/term/none)", default="none") (options, args) = parser.parse_args() numpy.random.seed(options.seed) if options.model and os.path.exists(options.model): with open(options.model, "rb") as f: lda, voca = pickle.load(f) elif not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-b) or model(-m)") else: import vocabulary if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") voca = vocabulary.Vocabulary() docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) lda = ITM(options.K, options.alpha, options.beta, options.eta, docs, voca.size()) param = (len(lda.docs), len(voca.vocas), options.K, options.alpha, options.beta, 
options.eta) print "corpus=%d, words=%d, K=%d, a=%f, b=%f, eta=%f" % param if options.constraint: if options.unassign == "all": add_constraint = lda.add_constraint_all elif options.unassign == "doc": add_constraint = lda.add_constraint_doc elif options.unassign == "term": add_constraint = lda.add_constraint_term elif options.unassign == "none": add_constraint = lda.add_constraint_none else: parser.error("unassign method(-u) must be all/doc/term/none") wordlist = options.constraint.split(',') idlist = [voca.vocas_id[w] for w in wordlist] print "\n== add constraint ==" for w, v in zip(idlist, wordlist): print "%s [%s]" % (v, ",".join(str(x) for x in lda.n_k_w[:,w])) add_constraint(idlist) lda.verify_topic() #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') lda_learning(lda, options.iteration, voca) with open(options.model, "wb") as f: pickle.dump((lda, voca), f)
def main(): t1 = time.time() import optparse import vocabulary global out_dir parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.filename: corpus, doc_ids, event_list = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") if options.seed != None: numpy.random.seed(options.seed) voca = vocabulary.Vocabulary(options.stopwords) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) if event_list is not None: options.K = len(event_list) suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') out_dir = '%s/all_words/Topic_%d_alpha_%f_beta_%f_iter_%d/%s' % ( out_dir, options.K, options.alpha, options.beta, options.iteration, suffix) try: os.makedirs(out_dir) except Exception, e: print ' %s Dir exist ' % (out_dir) print 'E MSG : ', e
def main():
    """Train the eta-LDA model on a chosen dataset and log setup/timing.

    Dataset selection (--dataset) picks the loader: Dataset-1 uses
    vocabulary.load_file, anything else uses vocabulary.load_file_reuter.
    Results and a log file are written under a timestamped out_dir.
    """
    t1 = time.time()
    import optparse
    import vocabulary
    global out_dir

    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1)
    parser.add_option("--eta", dest="eta", type="float", help="parameter eta", default=0.2)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    parser.add_option("--dataset", dest="did", help="setup details : Dataset-1/Dataset-2/Dataset-3", default="Dataset-1")
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        if options.did == 'Dataset-1':
            corpus, doc_ids, event_list, total_no_word = vocabulary.load_file(options.filename)
        else:
            corpus, doc_ids, event_list, total_no_word = vocabulary.load_file_reuter(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")
        # BUG FIX: these three were unbound on the -c path and caused
        # NameErrors further down (LDA(...) and the log line use them).
        doc_ids = list(range(len(corpus)))
        event_list = []
        total_no_word = sum(len(doc) for doc in corpus)
    if options.seed is not None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)
    # NOTE: the original "options.K = options.K  # len(event_list)" no-op
    # (a disabled per-event K override) has been removed.

    # Hyper-parameter-keyed, timestamped output directory.
    suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    out_dir = '%s/%s/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' % (
        out_dir, options.did, options.K, options.alpha, options.eta,
        options.iteration, suffix)
    print('out_dir: ', out_dir)
    try:
        os.makedirs(out_dir)
    except Exception as e:
        # Best-effort: report and continue if the directory already exists.
        print(' %s Dir exist ' % (out_dir))
        print('E MSG : ', e)

    lda = LDA(options.K, options.alpha, options.eta, docs, doc_ids,
              voca.size(), options.smartinit)
    t_int = time.time()  # initialization finished; kept for optional timing

    flog = '%s/log_file.txt' % (out_dir)
    # IDIOM: context manager instead of manual open/close.
    with open(flog, 'w') as f:
        f.write(
            "corpus(# of doc)=%d, no of event = %d , Uniq words=%d, Total # of word =%d, K=%d, a=%f, b=%f , iteration = %d \n"
            % (len(corpus), len(event_list), len(voca.vocas), total_no_word,
               options.K, options.alpha, options.eta, options.iteration))
    # BUG FIX: dropped the stray trailing comma after print(...) — a Python 2
    # remnant that built a throwaway (None,) tuple in Python 3.
    print("corpus=%d, no of event =%d , uniq words=%d, K=%d, a=%f, b=%f"
          % (len(corpus), len(event_list), len(voca.vocas), options.K,
             options.alpha, options.eta))

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)

    t2 = time.time()
    print(' Total time taken : %f ' % (t2 - t1))
    with open(flog, 'a') as f:
        # (typo "TOtal" fixed in the log message)
        f.write(' Total time taken : %f ' % (t2 - t1))
def main(): import optparse import vocabulary parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") if options.seed != None: numpy.random.seed(options.seed) voca = vocabulary.Vocabulary(options.stopwords) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit) print ("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)) #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') wt_distribution, dt_distribution = lda_learning(lda, options.iteration, voca) # Entropy entropy = [] num_topics, 
num_words = wt_distribution.shape num_docs = dt_distribution.shape[1] for t in range(num_topics): probs = 0.0 for doc in docs[:27685]: prob = 0.0 if len(doc) == 0: continue for w in doc: prob -= math.log(wt_distribution[t, w]*1.0) prob = prob/len(doc) probs += prob entropy.append(probs/len(docs[:27685])) print entropy entropy = [] for t in range(num_topics): probs = 0.0 for doc in docs[27685:]: prob = 0.0 if len(doc) == 0: continue for w in doc: prob -= math.log(wt_distribution[t, w]*1.0) prob = prob/len(doc) probs += prob entropy.append(probs/len(docs[27685:])) print entropy entropy = [] for t in range(num_topics): probs = 0.0 for doc in docs: prob = 0.0 if len(doc) == 0: continue for w in doc: prob -= math.log(wt_distribution[t, w]*1.0) prob = prob/len(doc) probs += prob entropy.append(probs/len(docs)) print entropy ''' Perplexity ''' perplexity = 0 N = 0 for line in wt_distribution: for v in line: perplexity += numpy.log(v) N += len(line) print N print "Perplexity", numpy.exp(float(-perplexity)/N) model = "./model_tlda/" print wt_distribution.shape f = open(model + "wt.txt", "a") for line in wt_distribution: for n in line: f.write(str(n) + " ") f.write("\n") f.close() f = open(model + "dt.txt", "a") print dt_distribution.shape for line in dt_distribution: for n in line: f.write(str(n) + " ") f.write("\n") f.close()
def main():
    """Train a GuidedLDA model with seed topics and write three reports:
    per-document dominant topic, full doc-topic score CSV, and top-20
    words per topic."""
    t1 = time.time()
    import optparse
    import vocabulary
    global out_dir

    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1)
    parser.add_option("--eta1", dest="eta1", type="float", help="parameter eta for ner word", default=0.4)
    parser.add_option("--eta2", dest="eta2", type="float", help="parameter eta for Non-ner word", default=0.2)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0)
    parser.add_option("--setup", dest="setup", help="setup details : ner_keywords/tf-df-iec/IG", default="ner_keywords")
    parser.add_option("--dataset", dest="did", help="setup details : Dataset-1/Dataset-2/Dataset-3", default="Dataset-1")
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        # Dataset-1 has its own loader; the Reuters-style loader covers the rest.
        if options.did == 'Dataset-1':
            corpus, doc_ids, event_list, total_no_word = vocabulary.load_file(options.filename)
        else:
            corpus, doc_ids, event_list, total_no_word = vocabulary.load_file_reuter(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")
    if options.seed != None:
        np.random.seed(options.seed)

    # Hyper-parameter-keyed, timestamped output directory.
    file_name_list = [
        options.did,
        'Topic-' + str(options.K),
        'alpha-' + str(options.alpha),
        'eta1-' + str(options.eta1),
        'eta2-' + str(options.eta2),
        'iter_' + str(options.iteration),
    ]
    suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    out_dir = os.path.join(out_dir, options.did, options.setup,
                           '_'.join(file_name_list), suffix)
    try:
        os.makedirs(out_dir)
    except OSError:
        # BUG FIX: the old message '% dir exists !' parsed "% d" as an int
        # conversion applied to a str and raised TypeError inside a bare
        # except; narrowed to OSError and fixed the format specifier.
        print('%s dir exists !' % (out_dir))

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    # Document-term count matrix for guidedlda.
    # BUG FIX: dtype=np.int — the alias was removed in NumPy >= 1.24.
    # (The original O(D*V) "value less than zero" scan was unreachable —
    # X is only ever incremented from zeros — and has been dropped.)
    X = np.zeros((len(docs), len(voca.vocas)), dtype=int)
    for i, doc in enumerate(docs):
        for word_id in doc:
            X[i][word_id] += 1

    # Seed words are stored per dataset/setup as <did>-<setup>.json.
    seed_topics_dir = os.path.join(SEED_DIR, options.did, options.setup)
    seed_topics_fname = '{}-{}.json'.format(options.did, options.setup)
    seed_topics_fname_total = os.path.join(seed_topics_dir, seed_topics_fname)
    seed_topics = load_seed_word(seed_topics_fname_total, voca.vocas_id, event_list)

    model = guidedlda.GuidedLDA(n_topics=options.K, n_iter=options.iteration + 1,
                                alpha=options.alpha, eta=options.eta2,
                                random_state=options.K, refresh=20)
    model.fit(X, seed_topics=seed_topics, seed_confidence=options.eta1)

    # Report 1: dominant topic per document.
    doc_topic = model.transform(X)
    fout_doc_topic = '%s/doc_topic_dist.txt' % (out_dir)
    with open(fout_doc_topic, 'w') as fdoc:
        st_doc_topic = ''
        for i, item in enumerate(docs):
            st_doc_topic += "{} : Topic_{}\n".format(doc_ids[i], doc_topic[i].argmax())
        fdoc.write(st_doc_topic)

    # Report 2: full doc-topic score matrix as CSV.
    topic_list = []
    for i in range(options.K):
        topic_list.append('Topic_%03d' % (i))
    print(doc_topic.shape, len(topic_list), len(doc_ids))
    df = pd.DataFrame(data=doc_topic, columns=topic_list, index=doc_ids)
    fout_doc_topic_score = os.path.join(out_dir, 'doc_topic_dist_score.csv')
    df.to_csv(fout_doc_topic_score)

    # Report 3: top n_top_words per topic with their probabilities.
    n_top_words = 20
    topic_word = model.topic_word_
    fout_topic_word = '%s/topic_word_dist.txt' % (out_dir)
    with open(fout_topic_word, 'w') as ftopic:
        st_topic_word = ''
        for i, topic_dist in enumerate(topic_word):
            # Indices of the n_top_words highest-probability words, descending.
            top_idx = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
            word_list = np.array(voca.vocas)[top_idx]
            st_topic_word += '\n\n\nTopic : {}\n-------------------\n'.format(i)
            st = ''
            for j, word in enumerate(word_list):
                st += '{}:{}\n'.format(word, topic_dist[top_idx[j]])
            st_topic_word += st
        ftopic.write(st_topic_word)