def main():
    import pickle
    import vocabulary
    # from sklearn.decomposition import PCA

    corpus = vocabulary.load_file('mood.txt')
    voca = vocabulary.Vocabulary(True)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    doctrain = docs[:450]
    doctest = docs[450:]
    # docs = voca.cut_low_freq(docs, 1)

    # SET parameters
    K = 10                      # number of topics
    alpha, beta = 0.5, 0.5
    V = voca.size()

    f = FileOutput("lda_trainning")
    f.out("corpus=%d, words=%d, K=%d, alpha=%f, beta=%f"
          % (len(docs), len(voca.vocas), K, alpha, beta))

    model = lda.LDA(K, alpha, beta, docs, V)
    lda_training(f, model, voca, iteration=30)  # set number of iterations

    # split the document-topic distributions back into the train/test parts
    theta = model.theta()[:450]
    newtheta = model.theta()[450:]
    with open("theta.pk", 'wb') as fp:      # separate name so the FileOutput logger is not shadowed
        pickle.dump(theta, fp)
    with open("newtheta.pk", 'wb') as fp:
        pickle.dump(newtheta, fp)
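# The driver above calls lda_training(f, model, voca, iteration=30), which is not
# included in this collection. A minimal sketch of such a driver, assuming the LDA
# object exposes per-sweep inference() and perplexity() methods (both assumptions;
# only the call site above is given), with progress written through the FileOutput
# logger:
def lda_training(f, model, voca, iteration=30):
    # voca is accepted to mirror the call site above but is unused in this sketch
    for i in range(iteration):
        model.inference()  # one Gibbs sampling sweep over all documents (assumed API)
        f.out("iteration=%d, perplexity=%f" % (i + 1, model.perplexity()))
    f.out("training finished after %d iterations" % iteration)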
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")
    if options.seed is not None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)
    print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")
    if options.seed is not None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)

    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)
def getVocabulary():
    import vocabulary
    import random
    data_path = "/media/University/UniversityDisc/2-Master/MasterThesis/EjecucionTesis/Desarrollo/PythonProjects/Data/"
    filename = data_path + "aolGoals.txt"
    stopwords = True
    df = 0
    #(options, args) = parser.parse_args()
    if not filename:
        raise ValueError("need corpus filename(-l) or corpus range(-l)")
    corpus = vocabulary.load_file(filename)
    if not corpus:
        raise ValueError("corpus range(-l) forms 'start:end'")

    cp = list(corpus)
    random.seed(326719)
    random.shuffle(cp)
    p = int(len(cp) * .7)
    cp_train = cp[0:p]
    cp_test = cp[p:]
    print "Corpus to Test:", len(cp_test)
    print "Corpus to Train:", len(cp_train)

    voca = vocabulary.Vocabulary(stopwords)
    docs = [voca.doc_to_ids(doc) for doc in cp_train]
    if df > 0:
        docs = voca.cut_low_freq(docs, df)
    return voca, docs, cp_train
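# A hypothetical caller of getVocabulary(), shown only for context. The LDA
# constructor arguments and the lda_learning call mirror the other drivers in this
# collection; the hyperparameter values (K=20, alpha=beta=0.5, 100 iterations) are
# placeholders, not values from the original script.
if __name__ == "__main__":
    voca, docs, cp_train = getVocabulary()
    lda = LDA(20, 0.5, 0.5, docs, voca.size(), False)
    lda_learning(lda, 100, voca)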
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("--lamda", dest="lamda", type="float", help="parameter lamda", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not options.filename:
        parser.error("need corpus filename(-f)")

    if options.filename:
        (pids, tids) = vocabulary.load_file(options.filename)
    if options.seed is not None:
        numpy.random.seed(options.seed)

    # voca is the object which stores the data structures needed by LDA
    voca = vocabulary.Vocabulary(options.stopwords)
    docs = voca.PT_to_idlist(pids, tids)
    #print docs
    size_of_vocab = max(tids) + 1

    lda = BLDA(options.K, options.alpha, options.beta, options.lamda, docs, size_of_vocab, options.smartinit)
    #print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)
    blda_learning(lda, options.iteration)
def main(): import optparse import vocabulary global out_dir parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("--per", dest="per_weight", type="float", help="person weight", default=0.2) parser.add_option("--loc", dest="loc_weight", type="float", help="location weight", default=0.4) parser.add_option("--org", dest="org_weight", type="float", help="organisation weight", default=0.1) (options, args) = parser.parse_args() ''' parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) parser.add_option("--eta1", dest="eta1", type="float", help="parameter eta for ner word", default=0.4) parser.add_option("--eta2", dest="eta2", type="float", help="parameter eta for Non-ner word", default=0.2) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() pass ''' if options.filename: corpus, doc_ids, event_list = vocabulary.load_file(options.filename) else: options.filename = 'filtered_event_new2.pkl' corpus, doc_ids, event_list = vocabulary.load_file(options.filename) load_file(options.filename, options.per_weight, options.loc_weight, options.org_weight)
def main():
    import vocabulary
    corpus = vocabulary.load_file("small_train.txt")
    voca = vocabulary.Vocabulary()
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    df = 0  # document-frequency threshold; the options object was removed in this hard-coded variant
    if df > 0:
        docs = voca.cut_low_freq(docs, df)
    lda = LDA(20, 0.5, 0.5, docs, voca.size(), False)
    lda_learning(lda, 100, voca)
def main():
    import optparse
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=numpy.random.gamma(1, 1))
    parser.add_option("--gamma", dest="gamma", type="float", help="parameter gamma", default=numpy.random.gamma(1, 1))
    parser.add_option("--base", dest="base", type="float", help="parameter of base measure H", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="initial number of topics", default=1)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10)
    parser.add_option("-s", dest="stopwords", type="int", help="0=exclude stop words, 1=include stop words", default=1)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.seed is not None:
        numpy.random.seed(options.seed)

    import vocabulary
    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")

    voca = vocabulary.Vocabulary(options.stopwords == 0)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    hdplda = HDPLDA(options.K, options.alpha, options.gamma, options.base, docs, voca.size())
    print "corpus=%d words=%d alpha=%f gamma=%f base=%f initK=%d stopwords=%d" % (
        len(corpus), len(voca.vocas), options.alpha, options.gamma,
        options.base, options.K, options.stopwords,
    )
    # hdplda.dump()
    # import cProfile
    # cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), locals(), 'hdplda.profile')
    hdplda_learning(hdplda, options.iteration)

    phi = hdplda.worddist()
    for k, phi_k in enumerate(phi):
        print "\n-- topic: %d" % k
        for w in numpy.argsort(-phi_k)[:20]:
            print "%s: %f" % (voca[w], phi_k[w])
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="corpus_filename", help="corpus filename")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    #parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0)
    (options, args) = parser.parse_args()

    if options.corpus_filename:
        corpus = vocabulary.load_file(options.corpus_filename)
    else:
        corpus = None
    if not corpus:
        parser.error("need corpus filename(-f)")

    if options.seed is not None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size())
    #print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.numberOfTopics, options.alpha, options.beta)
    lda_learning(lda, options.iteration, voca)
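# Every driver here leans on vocabulary.load_file, which is not part of this
# collection. A simplified stand-in, assuming one document per line and lowercase
# alphabetic tokens (the tokenization rule is an assumption; the real module also
# handles stop words and the Brown corpus, which this sketch omits):
import re

def load_file(filename):
    corpus = []
    with open(filename) as f:
        for line in f:
            doc = re.findall(r'[a-z]+', line.lower())  # split into lowercase word tokens
            if doc:
                corpus.append(doc)
    return corpus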
def main():
    import optparse
    import vocabulary  # vocabulary package
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0)
    (options, args) = parser.parse_args()
    # options = eval(str(options))
    # for k, v in options.items():
    #     print(k, v)
    help(vocabulary)
    if not (options.filename or options.corpus):  # both of these default to None
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")
    if options.seed is not None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)
    print("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (
        len(corpus), len(voca.vocas), options.K, options.alpha, options.beta))

    # import cProfile
    # cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)
def main(): import optparse parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=numpy.random.gamma(1, 1)) parser.add_option("--gamma", dest="gamma", type="float", help="parameter gamma", default=numpy.random.gamma(1, 1)) parser.add_option("--base", dest="base", type="float", help="parameter of base measure H", default=0.5) parser.add_option("-k", dest="K", type="int", help="initial number of topics", default=1) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("-s", dest="stopwords", type="int", help="0=exclude stop words, 1=include stop words", default=1) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.seed != None: numpy.random.seed(options.seed) import vocabulary if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") voca = vocabulary.Vocabulary(options.stopwords==0) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) hdplda = HDPLDA(options.K, options.alpha, options.gamma, options.base, docs, voca.size()) print "corpus=%d words=%d alpha=%f gamma=%f base=%f initK=%d stopwords=%d" % (len(corpus), len(voca.vocas), options.alpha, options.gamma, options.base, options.K, options.stopwords) #hdplda.dump() #import cProfile #cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), locals(), 'hdplda.profile') hdplda_learning(hdplda, options.iteration) phi = hdplda.worddist() for k, phi_k in enumerate(phi): print "\n-- topic: %d" % k for w in numpy.argsort(-phi_k)[:20]: print "%s: %f" % (voca[w], phi_k[w])
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename", default='complete_document_one_2_one.pk')
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.1)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=10)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=500)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=True)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    corpus = vocabulary.load_file(options.filename)
    if options.seed is not None:
        numpy.random.seed(options.seed)

    lda = LDA(options.K, options.alpha, options.beta, corpus, [300, 1000], options.smartinit)
    lda_learning(lda, options.iteration)
def main(): import optparse parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=numpy.random.gamma(1, 1)) parser.add_option("--gamma", dest="gamma", type="float", help="parameter gamma", default=numpy.random.gamma(1, 1)) parser.add_option("--beta", dest="beta", type="float", help="parameter of beta measure H", default=0.5) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("-s", dest="stopwords", type="int", help="0=exclude stop words, 1=include stop words", default=1) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.seed != None: numpy.random.seed(options.seed) import vocabulary if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") voca = vocabulary.Vocabulary(options.stopwords==0) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) hdplda = HDPLDA(options.alpha, options.gamma, options.beta, docs, voca.size()) print "corpus=%d words=%d alpha=%.3f gamma=%.3f beta=%.3f stopwords=%d" % (len(corpus), len(voca.vocas), options.alpha, options.gamma, options.beta, options.stopwords) #hdplda.dump() #import cProfile #cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), locals(), 'hdplda.profile') hdplda_learning(hdplda, options.iteration) output_summary(hdplda, voca)
def main(): import optparse parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename", default="1000_p.txt") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=numpy.random.gamma(1, 1)) parser.add_option("--gamma", dest="gamma", type="float", help="parameter gamma", default=numpy.random.gamma(1, 1)) parser.add_option("--beta", dest="beta", type="float", help="parameter of beta measure H", default=0.5) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("-s", dest="stopwords", type="int", help="0=exclude stop words, 1=include stop words", default=1) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.seed != None: numpy.random.seed(options.seed) import vocabulary if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") voca = vocabulary.Vocabulary(options.stopwords == 0) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) hdplda = HDPLDA(options.alpha, options.gamma, options.beta, docs, voca.size()) print("corpus=%d words=%d alpha=%.3f gamma=%.3f beta=%.3f stopwords=%d" % (len(corpus), len(voca.vocas), options.alpha, options.gamma, options.beta, options.stopwords)) # hdplda.dump() # import cProfile # cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), locals(), 'hdplda.profile') hdplda_learning(hdplda, options.iteration) output_summary(hdplda, voca)
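# Both HDP-LDA drivers above finish with output_summary(hdplda, voca), which is not
# included here. A plausible sketch that mirrors the inline topic dump of the other
# HDP-LDA driver in this collection; hdplda.worddist() is assumed to return one word
# distribution per topic.
import numpy

def output_summary(hdplda, voca, topwords=20):
    phi = hdplda.worddist()
    for k, phi_k in enumerate(phi):
        print("\n-- topic: %d" % k)
        for w in numpy.argsort(-phi_k)[:topwords]:
            print("%s: %f" % (voca[w], phi_k[w]))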
def main(): import os import pickle import optparse parser = optparse.OptionParser() parser.add_option("-m", dest="model", help="model filename") parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-b", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.01) parser.add_option("--eta", dest="eta", type="float", help="parameter eta", default=100) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) parser.add_option( "-c", dest="constraint", help="add constraint (wordlist which should belong to the same topic)") parser.add_option("-u", "--unassign", dest="unassign", help="unassign method (all/doc/term/none)", default="none") (options, args) = parser.parse_args() numpy.random.seed(options.seed) if options.model and os.path.exists(options.model): with open(options.model, "rb") as f: lda, voca = pickle.load(f) elif not (options.filename or options.corpus): parser.error( "need corpus filename(-f) or corpus range(-b) or model(-m)") else: import vocabulary if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") voca = vocabulary.Vocabulary() docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) lda = ITM(options.K, options.alpha, options.beta, options.eta, docs, voca.size()) param = (len(lda.docs), len(voca.vocas), options.K, options.alpha, options.beta, options.eta) print "corpus=%d, words=%d, K=%d, a=%f, b=%f, eta=%f" % param if options.constraint: if options.unassign == "all": add_constraint = lda.add_constraint_all elif options.unassign == "doc": add_constraint = lda.add_constraint_doc elif options.unassign == "term": add_constraint = lda.add_constraint_term elif options.unassign == "none": add_constraint = lda.add_constraint_none else: parser.error("unassign method(-u) must be all/doc/term/none") wordlist = options.constraint.split(',') idlist = [voca.vocas_id[w] for w in wordlist] print "\n== add constraint ==" for w, v in zip(idlist, wordlist): print "%s [%s]" % (v, ",".join(str(x) for x in lda.n_k_w[:, w])) add_constraint(idlist) lda.verify_topic() #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') lda_learning(lda, options.iteration, voca) with open(options.model, "wb") as f: pickle.dump((lda, voca), f)
        parser.error('need output file(-o)')
    return (options, args)

if __name__ == "__main__":
    (options, args) = args()

    import vocabulary
    # Read corpus from file.
    if options.filename.split('.')[-1] == 'json':
        corpus = vocabulary.load_file_json(options.filename)
    else:
        corpus = vocabulary.load_file(options.filename)
    voca = vocabulary.Vocabulary(options.stopwords == 0)
    docs = [voca.doc_to_ids(doc) for doc in corpus]

    # Running HDP-LDA.
    hdplda = HDPLDA(options.K, options.alpha, options.gamma, options.base, docs, voca.size())
    print("corpus=%d words=%d alpha=%f gamma=%f base=%f initK=%d stopwords=%d" % (
        len(corpus), len(voca.vocas), options.alpha, options.gamma,
        options.base, options.K, options.stopwords))
    for i in range(options.iteration):
        hdplda.inference()
    print("K: %d" % len(hdplda.topics))
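# Only the tail of the args() option parser survives above. A hypothetical
# reconstruction of the full helper, inferred from the options the __main__ block
# actually reads (filename, stopwords, K, alpha, gamma, base, iteration) plus the
# output-file check it ends with; every flag name and default value below is a
# placeholder, not taken from the original script.
import optparse

def args():
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-o", dest="output", help="output file")
    parser.add_option("--alpha", dest="alpha", type="float", default=1.0, help="parameter alpha")
    parser.add_option("--gamma", dest="gamma", type="float", default=1.0, help="parameter gamma")
    parser.add_option("--base", dest="base", type="float", default=0.5, help="parameter of base measure H")
    parser.add_option("-k", dest="K", type="int", default=1, help="initial number of topics")
    parser.add_option("-i", dest="iteration", type="int", default=10, help="iteration count")
    parser.add_option("-s", dest="stopwords", type="int", default=1, help="0=exclude stop words, 1=include stop words")
    (options, args) = parser.parse_args()
    if not options.filename:
        parser.error('need corpus filename(-f)')
    if not options.output:
        parser.error('need output file(-o)')
    return (options, args)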
def main(): import optparse import vocabulary parser = optparse.OptionParser() parser.add_option("-f", dest="filename", type="string", help="corpus filename") parser.add_option("-t", dest="time_file", help="timestamp of documents") parser.add_option("-o", dest="output_dir", type="string", help="output directory") parser.add_option("-m", dest="model", help="previously trained model") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.01) parser.add_option("-p", dest="cont", type="float", help="parameter contribution proportion", \ default=0.5) parser.add_option("-k", dest="K", type="int", help="number of topics", default=50) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=500) parser.add_option("-s", dest="smartinit", action="store_false", \ help="smart initialize of parameters", default=True) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", \ action="store_true", default=True) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--wf", dest="wf", type="int", \ help="threshold of word frequency to cut words", default=1) parser.add_option("--num-proc", dest="nproc", type="int", help="number of processors", \ default=4) (options, args) = parser.parse_args() if not (options.filename) or \ not (options.time_file) or not(options.output_dir): parser.error("need (corpus filename(-f) and " "document timestamp file(-t) and output directory(-o)") if options.filename: corpus = vocabulary.load_file(options.filename) if options.seed != None: numpy.random.seed(options.seed) if not os.path.exists(options.output_dir): os.makedirs(options.output_dir) voca = vocabulary.Vocabulary(options.stopwords, options.wf) if options.model: (prev_voca, prev_lda) = pickle.load(open(options.model)) #drop one day worth's of data to accommodate the new day's data prev_voca, prev_lda = drop_one_day_data(prev_voca, prev_lda, options.alpha) options.K = prev_lda.K else: prev_lda = None prev_voca = None #generate the vocabularies for voca voca.gen_vocabs(corpus, prev_voca, prev_lda) docs = [voca.doc_to_ids(doc) for doc in corpus] #calculate the number of elements for each timestamp group in docs docs_nt = [] docs_times = [ item.strip() for item in open(options.time_file).readlines() ] tmp_nt = {} for time in set(docs_times): tmp_nt[time] = docs_times.count(time) for (time, count) in sorted(tmp_nt.items()): docs_nt.append(count) tmp_nt.clear() if options.model: #update docs tmp_docs = [] tmp_docs.extend(prev_lda.docs) tmp_docs.extend(docs) docs = tmp_docs #update docs_times tmp_docs_times = [] tmp_docs_times.extend(prev_lda.docs_times) tmp_docs_times.extend(docs_times) docs_times = tmp_docs_times #update docs_nt tmp_docs_nt = [] tmp_docs_nt.extend(prev_lda.docs_nt) tmp_docs_nt.extend(docs_nt) docs_nt = tmp_docs_nt #if options.wf > 0: docs = voca.cut_low_freq(docs, options.wf) #initialise lda lda = LDA(options.K, options.alpha, options.beta, options.cont, docs, docs_nt, voca.size(), \ docs_times, options.output_dir, prev_lda, options.nproc, options.smartinit) #print word frequency freqword = {} freqword_file = open(lda.output_dir + "/freqwords.txt", "w") for (vocab_id, freq) in enumerate(voca.wordfreq): freqword[voca.vocas[vocab_id]] = freq for (vocab, freq) in sorted(freqword.items(), key=operator.itemgetter(1), reverse=True): freqword_file.write(vocab + " " + str(freq) + "\n") 
freqword_file.flush() print "corpus=%d, words=%d, K=%d, a=%f, b=%f, nproc=%d" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta, options.nproc) #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') lda_learning(lda, options.iteration, voca) #save the model for potential re-use later lda.tlock = None pickle.dump((voca, lda), open(options.output_dir + "/model.dat", "w"))
def main(): import optparse import vocabulary parser = optparse.OptionParser() parser.add_option("-f", dest="filename", type="string", help="corpus filename") parser.add_option("-t", dest="time_file", help="timestamp of documents") parser.add_option("-o", dest="output_dir", type="string", help="output directory") parser.add_option("-m", dest="model", help="previously trained model") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.01) parser.add_option("-p", dest="cont", type="float", help="parameter contribution proportion", \ default=0.5) parser.add_option("-k", dest="K", type="int", help="number of topics", default=25) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=300) parser.add_option("-s", dest="smartinit", action="store_false", \ help="smart initialize of parameters", default=True) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", \ action="store_true", default=True) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--wf", dest="wf", type="int", \ help="threshold of word frequency to cut words", default=10) parser.add_option("--num-proc", dest="nproc", type="int", help="number of processors", \ default=12) (options, args) = parser.parse_args() if not (options.filename) or \ not (options.time_file) or not(options.output_dir): parser.error("need (corpus filename(-f) and " "document timestamp file(-t) and output directory(-o)") if options.filename: print 'options.filename = ', options.filename corpus = vocabulary.load_file(options.filename) print 'Corpus length = ', len(corpus) if options.seed != None: numpy.random.seed(options.seed) if not os.path.exists(options.output_dir): os.makedirs(options.output_dir) voca = vocabulary.Vocabulary(options.stopwords, options.wf) if options.model: (prev_voca, prev_lda) = pickle.load(open(options.model)) #drop one day worth's of data to accommodate the new day's data prev_voca, prev_lda = drop_one_day_data(prev_voca, prev_lda, options.alpha) options.K = prev_lda.K else: prev_lda = None prev_voca = None #generate the vocabularies for voca voca.gen_vocabs(corpus, prev_voca, prev_lda) docs = [voca.doc_to_ids(doc) for doc in corpus] #calculate the number of elements for each timestamp group in docs docs_nt = [] docs_times = [ item.strip() for item in open(options.time_file).readlines() ] print 'len docs_time=', str(len(docs_times)), 'len docs=', str(len(docs)) # for m, doc in enumerate(docs): # try: # t= docs_times[m] # except: # print 'm=',m, 'len docs_time=', str(len(docs_times)), 'len docs=', str(len(docs)) tmp_nt = {} for time in set(docs_times): tmp_nt[time] = docs_times.count(time) for (time, count) in sorted(tmp_nt.items()): docs_nt.append(count) tmp_nt.clear() if options.model: #update docs tmp_docs = [] tmp_docs.extend(prev_lda.docs) tmp_docs.extend(docs) docs = tmp_docs #update docs_times tmp_docs_times = [] tmp_docs_times.extend(prev_lda.docs_times) tmp_docs_times.extend(docs_times) docs_times = tmp_docs_times #update docs_nt tmp_docs_nt = [] tmp_docs_nt.extend(prev_lda.docs_nt) tmp_docs_nt.extend(docs_nt) docs_nt = tmp_docs_nt #if options.wf > 0: docs = voca.cut_low_freq(docs, options.wf) #initialise lda lda = LDA(options.K, options.alpha, options.beta, options.cont, docs, docs_nt, voca.size(), \ docs_times, options.output_dir, prev_lda, options.nproc, options.smartinit) #print word frequency freqword 
= {} freqword_file = open(lda.output_dir + "/freqwords.txt", "w") for (vocab_id, freq) in enumerate(voca.wordfreq): freqword[voca.vocas[vocab_id]] = freq for (vocab, freq) in sorted(freqword.items(), key=operator.itemgetter(1), reverse=True): freqword_file.write(vocab + " " + str(freq) + "\n") freqword_file.flush() print "corpus=%d, words=%d, K=%d, a=%f, b=%f, nproc=%d" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta, options.nproc) #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') phi_old = lda.worddist() voca_old = voca.vocas lda_learning(lda, options.iteration, voca) phi_new = lda.worddist() fjs = open(lda.output_dir + '/jsdivergence.out','a') oldvoca_file = open(lda.output_dir + "/oldvoca.txt","a") newvoca_file = open(lda.output_dir + "/newvoca.txt","a") oldphi_prob = open(lda.output_dir + "/oldphi.txt","a") newphi_prob = open(lda.output_dir + "/newphi.txt","a") fjs.write('topic# JSdiv') if prev_lda != None: #print '--PREVIOUS word_dist = ----', prev_lda.phi_prob for k in range(lda.K): res = JS.jensenShannonDivergence(phi_old[k],phi_new[k]) #print '\n' + str(k) +' '+ str(res) + ' PREV_phi = ' + str(len(phi_old[k]))+ str(phi_old[k]) + ' NEW_phi= ' + str(len(phi_new[k])) + str(phi_new[k]) +'\n----------\n' fjs.write('\n' + str(k) +' '+ str(res) + ' ' + str(len(phi_old[k])) + ' ' + str(len(phi_new[k]))) oldvoca_file.write('\nTOPIC='+ str(k) + ' WORDS=') for w in numpy.argsort(-phi_old[k])[:1000]: #print i, voca_old[i] oldvoca_file.write(voca_old[w] + " ") oldphi_prob.write(str(phi_old[k][w])+ " ") oldphi_prob.write('\n') oldvoca_file.write('\n') newvoca_file.write('\nTOPIC='+ str(k) + ' WORDS=') for w in numpy.argsort(-phi_new[k])[:1000]: #print i, voca.vocas[i] newvoca_file.write(voca.vocas[w] + " ") newphi_prob.write(str(phi_new[k][w])+ " ") newphi_prob.write('\n') newvoca_file.write('\n') #print '-- SELF word dist = ---', lda.phi_prob fjs.write('\n') #save the model for potential re-use later lda.tlock = None pickle.dump((voca, lda), open(options.output_dir + "/model.dat", "w"))
def main(): t1 = time.time() import optparse import vocabulary global out_dir parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1) parser.add_option("--eta", dest="eta", type="float", help="parameter eta", default=0.2) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) #parser.add_option("--setup", dest="setup", help="setup details", default="uniform") parser.add_option("--dataset", dest="did", help="setup details : Dataset-1/Dataset-2/Dataset-3", default="Dataset-1") (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.filename: if options.did == 'Dataset-1': corpus, doc_ids, event_list, total_no_word = vocabulary.load_file( options.filename) else: corpus, doc_ids, event_list, total_no_word = vocabulary.load_file_reuter( options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") if options.seed != None: numpy.random.seed(options.seed) voca = vocabulary.Vocabulary(options.stopwords) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) if event_list is not None: options.K = options.K #len(event_list) suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') #out_dir = '%s/all_words/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix) #out_dir = '%s/Dataset-1/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix) out_dir = '%s/%s/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' % ( out_dir, options.did, options.K, options.alpha, options.eta, options.iteration, suffix) #out_dir = '%s/Reuters-21578/R-8-train-train_no-stop/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix) #out_dir = '%s/20-Newsgroup/20-Newsgroup_train-train_all_term/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix) print('out_dir: ', out_dir) try: os.makedirs(out_dir) except Exception as e: print(' %s Dir exist ' % (out_dir)) print('E MSG : ', e) lda = LDA(options.K, options.alpha, options.eta, docs, doc_ids, voca.size(), options.smartinit) t_int = time.time() #print 'Intialization time : %f' %(t_int-t1) flog = '%s/log_file.txt' % (out_dir) f = open(flog, 'w') f.write( "corpus(# of doc)=%d, no of event = %d , Uniq words=%d, Toal # of word =%d, K=%d, a=%f, b=%f , iteration = %d \n" % (len(corpus), len(event_list), len(voca.vocas), total_no_word, options.K, options.alpha, options.eta, options.iteration)) f.close() print("corpus=%d, no of event =%d , uniq words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(event_list), len( 
voca.vocas), options.K, options.alpha, options.eta)), #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') lda_learning(lda, options.iteration, voca) t2 = time.time() print(' Total time taken : %f ' % (t2 - t1)) flog = '%s/log_file.txt' % (out_dir) f = open(flog, 'a') f.write(' TOtal time taken : %f ' % (t2 - t1)) f.close()
        parser.error('need output file(-o)')
    return (options, args)

if __name__ == "__main__":
    (options, args) = args()

    import vocabulary
    # Read corpus from file.
    if options.filename.split('.')[-1] == 'json':
        corpus = vocabulary.load_file_json(options.filename)
    else:
        corpus = vocabulary.load_file(options.filename)
    voca = vocabulary.Vocabulary(options.stopwords == 0)
    docs = [voca.doc_to_ids(doc) for doc in corpus]

    # Running HDP-LDA.
    hdplda = HDPLDA(options.K, options.alpha, options.gamma, options.base, docs, voca.size())
    print("corpus=%d words=%d alpha=%f gamma=%f base=%f initK=%d stopwords=%d" % (
        len(corpus), len(voca.vocas), options.alpha, options.gamma,
        options.base, options.K, options.stopwords))
    for i in range(options.iteration):
        hdplda.inference()
    beta = estimateAlphaMap(nkt, nk, beta, abeta, bbeta)


class FileOutput:
    def __init__(self, file):
        import datetime
        self.file = file + datetime.datetime.now().strftime('_%m%d_%H%M%S.txt')

    def out(self, s):
        with open(self.file, 'a') as f:
            print >> f, s


corpus = vocabulary.load_file('mood.txt')
# corpus = vocabulary.load_corpus('1:50')
voca = vocabulary.Vocabulary(True)
docs = [voca.doc_to_ids(doc) for doc in corpus]
w = docs[:86]
wq = docs[:10]
M = len(w)
Mq = len(wq)
V = voca.size()

init()
run(niter)
initq()
runq(niterq)
ppx()

theta = [[0 for j in range(K)] for i in range(M)]  # double
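# Usage example for the FileOutput helper defined above (Python 2 syntax, matching
# the surrounding snippet); the log name is arbitrary.
log = FileOutput("lda_log")              # creates e.g. lda_log_0131_142530.txt
log.out("corpus=%d, words=%d" % (M, V))  # each call appends one line to the log file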
def main(): import os import pickle import optparse parser = optparse.OptionParser() parser.add_option("-m", dest="model", help="model filename") parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-b", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.01) parser.add_option("--eta", dest="eta", type="float", help="parameter eta", default=100) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) parser.add_option("-c", dest="constraint", help="add constraint (wordlist which should belong to the same topic)") parser.add_option("-u", "--unassign", dest="unassign", help="unassign method (all/doc/term/none)", default="none") (options, args) = parser.parse_args() numpy.random.seed(options.seed) if options.model and os.path.exists(options.model): with open(options.model, "rb") as f: lda, voca = pickle.load(f) elif not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-b) or model(-m)") else: import vocabulary if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") voca = vocabulary.Vocabulary() docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) lda = ITM(options.K, options.alpha, options.beta, options.eta, docs, voca.size()) param = (len(lda.docs), len(voca.vocas), options.K, options.alpha, options.beta, options.eta) print "corpus=%d, words=%d, K=%d, a=%f, b=%f, eta=%f" % param if options.constraint: if options.unassign == "all": add_constraint = lda.add_constraint_all elif options.unassign == "doc": add_constraint = lda.add_constraint_doc elif options.unassign == "term": add_constraint = lda.add_constraint_term elif options.unassign == "none": add_constraint = lda.add_constraint_none else: parser.error("unassign method(-u) must be all/doc/term/none") wordlist = options.constraint.split(',') idlist = [voca.vocas_id[w] for w in wordlist] print "\n== add constraint ==" for w, v in zip(idlist, wordlist): print "%s [%s]" % (v, ",".join(str(x) for x in lda.n_k_w[:,w])) add_constraint(idlist) lda.verify_topic() #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') lda_learning(lda, options.iteration, voca) with open(options.model, "wb") as f: pickle.dump((lda, voca), f)
def main(): t1 = time.time() import optparse import vocabulary global out_dir parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.filename: corpus, doc_ids, event_list = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") if options.seed != None: numpy.random.seed(options.seed) voca = vocabulary.Vocabulary(options.stopwords) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) if event_list is not None: options.K = len(event_list) suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') out_dir = '%s/all_words/Topic_%d_alpha_%f_beta_%f_iter_%d/%s' % ( out_dir, options.K, options.alpha, options.beta, options.iteration, suffix) try: os.makedirs(out_dir) except Exception, e: print ' %s Dir exist ' % (out_dir) print 'E MSG : ', e
def main(): import optparse import vocabulary parser = optparse.OptionParser() parser.add_option("--newsf", dest="newsfile", help="news corpus filename") parser.add_option("--tweetsf", dest="tweetsfile", help="tweets corpus filename") parser.add_option("-a", dest="authorfile", help="author filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) (options, args) = parser.parse_args() if not (options.newsfile or options.corpus): parser.error("need corpus news file(--newsf) or corpus range(-c)") if not options.tweetsfile: parser.error("need corpus tweets file(--tweetsf)") if not options.authorfile: parser.error("need author file(-a)") if options.newsfile: news_corpus = vocabulary.load_file(options.newsfile) news_len = len(news_corpus) print "Load News data from '" + options.newsfile + "'" print "\t", news_len, "News in total" else: news_corpus = vocabulary.load_corpus(options.corpus) if not news_corpus: parser.error("corpus range(-c) forms 'start:end'") if options.seed != None: np.random.seed(options.seed) voca = vocabulary.Vocabulary(options.stopwords) print "Load Twitters data from '" + options.tweetsfile + "'" ori_twitter_corpus = vocabulary.load_file(options.tweetsfile, 'utf-8') print "Initialize the authors set" num_authors, author_set = vocabulary.load_author(options.authorfile) print "\t", num_authors, "authors in total" # Remove words less frequent twitter_dict = {} for line in ori_twitter_corpus: for w in line: if w in twitter_dict: twitter_dict[w] += 1 else: twitter_dict[w] = 1 twitter_corpus = [] for line in ori_twitter_corpus: for w in line: if twitter_dict[w] < 2: line.remove(w) twitter_corpus.append(line) twitter_corpus = twitter_corpus[:len(author_set)] twitter_len = len(ori_twitter_corpus) print "\t", twitter_len, "Tweets in total" # Whole collection corpus = news_corpus + twitter_corpus # voca = vocabulary.Vocabulary(options.stopwords) docs = [voca.doc_to_ids(doc) for doc in (corpus)] # docs is the documents list [[1,2,3],[4,2,3...]] twitter_words_set = set([w for doc in (twitter_corpus) for w in voca.doc_to_ids(doc)]) # is the Twitter list news_words_set = set([w for doc in (news_corpus) for w in voca.doc_to_ids(doc)]) # is the News list print "Number for Twitter words:", len(twitter_words_set) print "Number of News words:", len(news_words_set) if options.df > 0: docs = voca.cut_low_freq(docs, options.df) corpus_collection = list(set([w for doc in docs for w in doc])) # Initialization print "Initialize the heterogenous topic model" htm = HTM(options.K, options.alpha, options.beta, docs, news_len, num_authors, author_set, voca, twitter_words_set, news_words_set) # Get the results news_wt_distribution, tweets_wt_distribution, htm_wt_distribution, 
tweets_at_distribution, news_dt_distribution = htm.gibbs_sampling(options.iteration) print "KL from news to htm" KL_divergence(news_wt_distribution, htm_wt_distribution) print "KL from tweets to htm" KL_divergence(tweets_wt_distribution, htm_wt_distribution) print "KL from news to tweets" KL_divergence(news_wt_distribution, tweets_wt_distribution) print "KL from tweets to news" KL_divergence(tweets_wt_distribution, news_wt_distribution) htm.print_top_words(20, news_wt_distribution, voca.vocas) ''' Perplexity ''' perplexity = 0 N = 0 for line in htm_wt_distribution: for v in line: perplexity += np.log(v) N += len(line) print "Perplexity", np.exp(float(-perplexity)/N) htm.print_entropy() f = open(model + "news_wt.txt", "a") for line in news_wt_distribution: for n in line: f.write(str(n) + " ") f.write("\n") f.close() f = open(model + "tweets_wt.txt", "a") for line in tweets_wt_distribution: for n in line: f.write(str(n) + " ") f.write("\n") f.close() f = open(model + "htm_wt.txt", "a") for line in htm_wt_distribution: for n in line: f.write(str(n) + " ") f.write("\n") f.close() f = open(model + "tweets_at.txt", "a") for line in tweets_at_distribution: for n in line: f.write(str(n) + " ") f.write("\n") f.close() f = open(model + "news_dt.txt", "a") for line in news_dt_distribution: for n in line: f.write(str(n) + " ") f.write("\n") f.close()
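# The heterogeneous-topic-model driver above compares word-topic distributions with
# a KL_divergence helper that is not shown. A minimal sketch, assuming each argument
# is a list of per-topic word distributions of equal length and that the helper
# reports the mean divergence; the eps smoothing is an assumption to avoid log(0).
import numpy as np

def KL_divergence(p_rows, q_rows, eps=1e-12):
    divs = []
    for p, q in zip(p_rows, q_rows):
        p = np.asarray(p, dtype=float) + eps
        q = np.asarray(q, dtype=float) + eps
        p /= p.sum()  # renormalise after smoothing
        q /= q.sum()
        divs.append(np.sum(p * np.log(p / q)))
    print("mean KL over %d topics: %f" % (len(divs), np.mean(divs)))
    return divs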
def main(): import optparse import vocabulary parser = optparse.OptionParser() parser.add_option("-f", dest="filename", type="string", help="corpus filename") parser.add_option("-t", dest="time_file", help="timestamp of documents") parser.add_option("-o", dest="output_dir", type="string", help="output directory") parser.add_option("-m", dest="model", help="previously trained model") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.01) parser.add_option("-p", dest="cont", type="float", help="parameter contribution proportion", \ default=0.5) parser.add_option("-k", dest="K", type="int", help="number of topics", default=50) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=500) parser.add_option("-s", dest="smartinit", action="store_false", \ help="smart initialize of parameters", default=True) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", \ action="store_true", default=True) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--wf", dest="wf", type="int", \ help="threshold of word frequency to cut words", default=1) parser.add_option("--num-proc", dest="nproc", type="int", help="number of processors", \ default=4) (options, args) = parser.parse_args() if not (options.filename) or \ not (options.time_file) or not(options.output_dir): parser.error("need (corpus filename(-f) and " "document timestamp file(-t) and output directory(-o)") if options.filename: corpus = vocabulary.load_file(options.filename) if options.seed != None: numpy.random.seed(options.seed) if not os.path.exists(options.output_dir): os.makedirs(options.output_dir) voca = vocabulary.Vocabulary(options.stopwords, options.wf) if options.model: (prev_voca, prev_lda) = pickle.load(open(options.model)) #drop one day worth's of data to accommodate the new day's data prev_voca, prev_lda = drop_one_day_data(prev_voca, prev_lda, options.alpha) options.K = prev_lda.K else: prev_lda = None prev_voca = None #generate the vocabularies for voca voca.gen_vocabs(corpus, prev_voca, prev_lda) docs = [voca.doc_to_ids(doc) for doc in corpus] #calculate the number of elements for each timestamp group in docs docs_nt = [] docs_times = [item.strip() for item in open(options.time_file).readlines()] tmp_nt = {} for time in set(docs_times): tmp_nt[time] = docs_times.count(time) for (time, count) in sorted(tmp_nt.items()): docs_nt.append(count) tmp_nt.clear() if options.model: #update docs tmp_docs = [] tmp_docs.extend(prev_lda.docs) tmp_docs.extend(docs) docs = tmp_docs #update docs_times tmp_docs_times = [] tmp_docs_times.extend(prev_lda.docs_times) tmp_docs_times.extend(docs_times) docs_times = tmp_docs_times #update docs_nt tmp_docs_nt = [] tmp_docs_nt.extend(prev_lda.docs_nt) tmp_docs_nt.extend(docs_nt) docs_nt = tmp_docs_nt #if options.wf > 0: docs = voca.cut_low_freq(docs, options.wf) #initialise lda lda = LDA(options.K, options.alpha, options.beta, options.cont, docs, docs_nt, voca.size(), \ docs_times, options.output_dir, prev_lda, options.nproc, options.smartinit) #print word frequency freqword = {} freqword_file = open(lda.output_dir + "/freqwords.txt", "w") for (vocab_id, freq) in enumerate(voca.wordfreq): freqword[voca.vocas[vocab_id]] = freq for (vocab, freq) in sorted(freqword.items(), key=operator.itemgetter(1), reverse=True): freqword_file.write(vocab + " " + str(freq) + "\n") 
freqword_file.flush() print "corpus=%d, words=%d, K=%d, a=%f, b=%f, nproc=%d" % ( len(corpus), len( voca.vocas), options.K, options.alpha, options.beta, options.nproc) #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') lda_learning(lda, options.iteration, voca) #save the model for potential re-use later lda.tlock = None pickle.dump((voca, lda), open(options.output_dir + "/model.dat", "w"))
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("--lamda", dest="lamda", type="float", help="parameter lamda", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not options.filename:
        parser.error("need corpus filename(-f)")

    if options.filename:
        (pids, tids) = vocabulary.load_file(options.filename)
    if options.seed is not None:
        numpy.random.seed(options.seed)

    # voca is the object which stores the data structures needed by LDA
    voca = vocabulary.Vocabulary(options.stopwords)
    docs = voca.PT_to_idlist(pids, tids)
    #print docs
    size_of_vocab = max(tids) + 1

    lda = BLDA(options.K, options.alpha, options.beta, options.lamda, docs, size_of_vocab, options.smartinit)
    #print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)
    blda_learning(lda, options.iteration)
def main():
    t1 = time.time()
    import optparse
    import vocabulary
    global out_dir
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1)
    parser.add_option("--eta1", dest="eta1", type="float", help="parameter eta for NER words", default=0.4)
    parser.add_option("--eta2", dest="eta2", type="float", help="parameter eta for non-NER words", default=0.2)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true",
                      help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words",
                      action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int",
                      help="threshold of document frequency to cut words", default=0)
    parser.add_option("--setup", dest="setup", help="setup details: ner_keywords/tf-df-icf/IG",
                      default="ner_keywords")
    parser.add_option("--dataset", dest="did", help="dataset: Dataset-1/Dataset-2/Dataset-3",
                      default="Dataset-1")
    (options, args) = parser.parse_args()

    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")
    if options.filename:
        if options.did == 'Dataset-1':
            corpus, doc_ids, event_list, total_no_word = vocabulary.load_file(options.filename)
        else:
            corpus, doc_ids, event_list, total_no_word = vocabulary.load_file_reuter(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")
    if options.seed != None:
        np.random.seed(options.seed)

    # build the output directory name from the run parameters
    file_name_list = [
        options.did,
        'Topic-' + str(options.K),
        'alpha-' + str(options.alpha),
        'eta1-' + str(options.eta1),
        'eta2-' + str(options.eta2),
        'iter_' + str(options.iteration),
    ]
    suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    out_dir = os.path.join(out_dir, options.did, options.setup, '_'.join(file_name_list), suffix)
    try:
        os.makedirs(out_dir)
    except OSError:
        print('%s dir exists!' % (out_dir))

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    # document-term count matrix expected by GuidedLDA
    X = np.zeros((len(docs), len(voca.vocas)), dtype=np.int64)  # np.int is deprecated in recent NumPy
    for i, doc in enumerate(docs):
        for word_id in doc:
            X[i][word_id] += 1
    # sanity check: counts should never be negative
    for i in range(len(docs)):
        for j in range(len(voca.vocas)):
            if X[i][j] < 0:
                print(' Value less than zero :', i, j, X[i][j], voca.vocas[j])

    # Guided LDA with seed topics
    seed_topics_dir = os.path.join(SEED_DIR, options.did, options.setup)
    seed_topics_fname = '{}-{}.json'.format(options.did, options.setup)
    seed_topics_fname_total = os.path.join(seed_topics_dir, seed_topics_fname)
    seed_topics = load_seed_word(seed_topics_fname_total, voca.vocas_id, event_list)
    model = guidedlda.GuidedLDA(n_topics=options.K, n_iter=options.iteration + 1,
                                alpha=options.alpha, eta=options.eta2,
                                random_state=options.K, refresh=20)
    model.fit(X, seed_topics=seed_topics, seed_confidence=options.eta1)

    # write the dominant topic of each document
    doc_topic = model.transform(X)
    fout_doc_topic = '%s/doc_topic_dist.txt' % (out_dir)
    fdoc = open(fout_doc_topic, 'w')
    st_doc_topic = ''
    for i, item in enumerate(docs):
        st_doc_topic += "{} : Topic_{}\n".format(doc_ids[i], doc_topic[i].argmax())
    fdoc.write(st_doc_topic)
    fdoc.close()

    # write the full document-topic distribution to doc_topic_dist_score.csv
    topic_list = ['Topic_%03d' % (i) for i in range(options.K)]
    print(doc_topic.shape, len(topic_list), len(doc_ids))
    df = pd.DataFrame(data=doc_topic, columns=topic_list, index=doc_ids)
    fout_doc_topic_score = os.path.join(out_dir, 'doc_topic_dist_score.csv')
    df.to_csv(fout_doc_topic_score)

    # write the top 20 words of each topic with their weights
    n_top_words = 20
    topic_word = model.topic_word_
    fout_topic_word = '%s/topic_word_dist.txt' % (out_dir)
    ftopic = open(fout_topic_word, 'w')
    st_topic_word = ''
    for i, topic_dist in enumerate(topic_word):
        word_list = np.array(voca.vocas)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        score_list = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
        st_topic_word += '\n\n\nTopic : {}\n-------------------\n'.format(i)
        for j, word in enumerate(word_list):
            st_topic_word += '{}:{}\n'.format(word, topic_dist[score_list[j]])
    ftopic.write(st_topic_word)
    ftopic.close()
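load_seed_word is not shown above; the guidedlda library expects seed_topics as a {word_id: topic_id} dict, so a minimal sketch of what it presumably builds is given below. The JSON layout (topic label mapped to a list of seed words) and the helper name build_seed_topics are assumptions, not the author's actual implementation.

import json

def build_seed_topics(json_path, vocas_id):
    # vocas_id: word -> integer id mapping, as kept by the Vocabulary object
    with open(json_path) as f:
        seeds = json.load(f)                 # assumed layout: {"0": ["earthquake", "quake"], ...}
    seed_topics = {}
    for topic_id, words in seeds.items():
        for word in words:
            if word in vocas_id:             # skip seed words missing from the vocabulary
                seed_topics[vocas_id[word]] = int(topic_id)
    return seed_topics

The seed_confidence passed above (options.eta1) then controls how strongly the sampler biases those seeded words toward their assigned topics.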
def main():
    import optparse
    import vocabulary
    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true",
                      help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words",
                      action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int",
                      help="threshold of document frequency to cut words", default=0)
    (options, args) = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")
    if options.filename:
        corpus = vocabulary.load_file(options.filename)
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")
    if options.seed != None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit)
    print ("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (
        len(corpus), len(voca.vocas), options.K, options.alpha, options.beta))
    #import cProfile
    #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    wt_distribution, dt_distribution = lda_learning(lda, options.iteration, voca)

    # Entropy: per topic, the average per-token negative log probability of the
    # topic's word distribution, computed over three document subsets
    # (the first 27685 documents, the remainder, and the whole corpus; the split is hard-coded).
    num_topics, num_words = wt_distribution.shape
    num_docs = dt_distribution.shape[1]

    def subset_entropy(doc_subset):
        entropy = []
        for t in range(num_topics):
            probs = 0.0
            for doc in doc_subset:
                if len(doc) == 0:
                    continue
                prob = 0.0
                for w in doc:
                    prob -= math.log(wt_distribution[t, w] * 1.0)
                probs += prob / len(doc)
            entropy.append(probs / len(doc_subset))
        return entropy

    print subset_entropy(docs[:27685])
    print subset_entropy(docs[27685:])
    print subset_entropy(docs)

    ''' Perplexity '''
    # exponential of the mean negative log of the topic-word matrix entries
    # (not a held-out, per-document perplexity)
    perplexity = 0
    N = 0
    for line in wt_distribution:
        for v in line:
            perplexity += numpy.log(v)
        N += len(line)
    print N
    print "Perplexity", numpy.exp(float(-perplexity) / N)

    # dump the topic-word and document-topic distributions
    model = "./model_tlda/"
    print wt_distribution.shape
    f = open(model + "wt.txt", "a")
    for line in wt_distribution:
        for n in line:
            f.write(str(n) + " ")
        f.write("\n")
    f.close()
    print dt_distribution.shape
    f = open(model + "dt.txt", "a")
    for line in dt_distribution:
        for n in line:
            f.write(str(n) + " ")
        f.write("\n")
    f.close()
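The perplexity printed above is the exponential of the mean negative log of the topic-word entries themselves; the usual corpus perplexity instead scores each token under its document's topic mixture. A minimal sketch, assuming wt_distribution is the K×V topic-word matrix and dt_distribution is the K×D document-topic matrix returned by lda_learning (the helper name corpus_perplexity is not from the original script):

import math
import numpy

def corpus_perplexity(docs, wt, dt):
    # wt: K x V topic-word probabilities; dt: K x D document-topic probabilities
    log_like, n_tokens = 0.0, 0
    for d, doc in enumerate(docs):
        theta_d = dt[:, d]                      # topic mixture of document d
        for w in doc:
            p_w = numpy.dot(theta_d, wt[:, w])  # p(w|d) = sum_k theta_dk * phi_kw
            log_like += math.log(p_w)
            n_tokens += 1
    return math.exp(-log_like / n_tokens)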
def main():
    t1 = time.time()
    import optparse
    import vocabulary
    global out_dir
    parser = optparse.OptionParser()
    parser.add_option("--finp", dest="filename_ip", help="input filename")
    parser.add_option("--fsp", dest="filename_sp", help="special words filename")
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--eta1", dest="eta1", type="float", help="parameter eta for NER words", default=0.4)
    parser.add_option("--eta2", dest="eta2", type="float", help="parameter eta for non-NER words", default=0.2)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10)
    parser.add_option("-s", dest="smartinit", action="store_true",
                      help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words",
                      action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int",
                      help="threshold of document frequency to cut words", default=0)
    parser.add_option("--dp", dest="dp", help="Dirichlet prior: symmetric or asymmetric?")
    parser.add_option("--setup", dest="setup", help="setup details")
    parser.add_option("--datasets", dest="did", help="dataset id", default="dataset_1")
    (options, args) = parser.parse_args()

    if options.filename_ip and options.filename_sp:
        special_words = init_special_words(options.filename_sp)
        if options.did == 'Dataset-1':
            corpus, doc_ids, event_list = vocabulary.load_file(options.filename_ip, special_words)
        else:
            corpus, doc_ids, event_list = vocabulary.load_file_reuter(options.filename_ip, special_words)
    else:
        # fall back to the default input with no special-words file
        options.filename_ip = 'filtered_event_new2.pkl'
        options.filename_sp = ''
        special_words = init_special_words(options.filename_sp)
        corpus, doc_ids, event_list = vocabulary.load_file(options.filename_ip, special_words)
    if options.seed != None:
        numpy.random.seed(options.seed)

    voca = vocabulary.Vocabulary(options.stopwords, special_words)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)
    if event_list is not None:
        options.K = options.K  # could instead be set to len(event_list)

    suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    V1, V2 = voca.size()  # number of unique NER words and non-NER words
    eta1 = initialize_eta1(V1, voca, options.filename_sp)  # TODO: revisit initialize_eta1

    fname_sp = options.filename_sp.replace('/', '-')
    if ('ner_keywords' in options.setup or 'tf-df-icf' in options.setup or 'IG' in options.setup):
        out_dir = '%s/%s/%s/%s_Topic-%d_alpha-%0.2f_eta2-%0.2f_eta1-%0.2f_iter_%d/%s' % (
            out_dir, options.did, options.setup, options.did, options.K,
            options.alpha, options.eta2, options.eta1, options.iteration, suffix)
    else:
        print('Out directory is not defined')
        return
    print(' out_dir : ', out_dir)
    try:
        os.makedirs(out_dir)
    except Exception:
        print(' %s dir exists ' % (out_dir))

    print('V1 = %d , V2 = %d ' % (V1, V2))
    # TODO: pass options.smartinit instead of the hard-coded True
    lda = LDA(options.K, options.alpha, eta1, options.eta2, docs, doc_ids, V1, V2, smartinit=True)

    flog = '%s/log_file.txt' % (out_dir)
    f = open(flog, 'w')
    f.write("corpus=%d, V1_ner=%d, V2_Nner=%d, K=%d, alpha=%0.2f, eta_2_Nner=%0.2f, iteration=%d\n"
            % (len(corpus), V1, V2, options.K, options.alpha, options.eta2, options.iteration))
    f.write('Dataset-%s, input_file=%s, special word file=%s\n'
            % (options.did, options.filename_ip, options.filename_sp))
    f.close()
    print("corpus=%d, V1_ner=%d, V2_Nner=%d, K=%d, alpha=%0.2f, eta_2_Nner=%0.2f, iteration=%d\n"
          % (len(corpus), V1, V2, options.K, options.alpha, options.eta2, options.iteration))

    # import cProfile
    # cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    lda_learning(lda, options.iteration, voca)

    t2 = time.time()
    print(' Total time taken : %f ' % (t2 - t1))
    f = open(flog, 'a')
    f.write(' Total time taken : %f ' % (t2 - t1))
    f.close()
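A hedged invocation sketch for the NER-weighted variant above; the script name and the special-words file path are placeholders (filtered_event_new2.pkl is the default input hard-coded in main()), and --setup must contain one of ner_keywords, tf-df-icf or IG for the output directory to be created.

# hypothetical invocation (script and file names are assumptions):
#   python ner_lda.py --finp filtered_event_new2.pkl --fsp seeds/ner_keywords.txt \
#       --setup ner_keywords --datasets Dataset-1 -k 20 --alpha 0.5 --eta1 0.4 --eta2 0.2 -i 10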