def test():
    """Train an MGLDA model on a corpus file and write the learned model out.

    Command line: <script> <corpus_file> <model_file>
    Reads sys.argv directly; logs timing for each phase via the logging module.
    """
    data_file = sys.argv[1]
    model_file = sys.argv[2]
    # import nltk.corpus
    logging.info("begin to run")
    t1 = time.time()
    import vocabulary_for_mglda as vocabulary
    # corpus = vocabulary.load_corpus_each_sentence("pushed_words.dat")
    corpus = vocabulary.load_corpus_each_sentence(data_file)
    t2 = time.time()
    logging.info("load corpus succeed. cost:%d s", t2 - t1)
    # docs[sentence_idx][word_idx]
    voca = vocabulary.Vocabulary(True)
    docs = [voca.doc_to_ids_each_sentence(doc) for doc in corpus]
    t3 = time.time()
    logging.info("doc_to_id succeed. cost:%d s", t3 - t2)
    # Hyperparameters, named individually instead of the original opaque
    # 12-target tuple assignment (which also redundantly reassigned `docs`
    # to itself).
    K_gl = 50            # number of global topics
    K_loc = 10           # number of local topics
    gamma = 0.1
    alpha_gl = 0.1
    alpha_loc = 0.1
    alpha_mix_gl = 0.1
    alpha_mix_loc = 0.1
    beta_gl = 0.1
    beta_loc = 0.1
    T = 3                # NOTE(review): presumably the sliding-window count — confirm in MGLDA
    W = voca.size()      # vocabulary size
    mglda = MGLDA(K_gl, K_loc, gamma, alpha_gl, alpha_loc, alpha_mix_gl,
                  alpha_mix_loc, beta_gl, beta_loc, T, docs, W)
    logging.info("corpus=%d, words=%d, K_gl=%d, K_loc=%d, gamma=%f, alpha_gl=%f, alpha_loc=%f, alpha_mix_gl=%f, alpha_mix_loc=%f, beta_gl=%f, beta_loc=%f"
                 % (len(corpus), len(voca.vocas), K_gl, K_loc, gamma, alpha_gl,
                    alpha_loc, alpha_mix_gl, alpha_mix_loc, beta_gl, beta_loc))
    t4 = time.time()
    logging.info("initialize succeed. cost:%d s", t4 - t3)
    logging.info("begin to learn")
    iteration = 1000
    # `with` guarantees the model file is closed even if learning raises
    # (the original used bare open()/close(), leaking the handle on error).
    with open(model_file, 'wb') as out:
        mglda_learning(mglda, iteration, voca, out)
    logging.info("learn succeed. cost:%d s", time.time() - t4)
def test():
    """Train an MGLDA model on the fixed corpus slice "0:2000" and print the
    run configuration.

    Older variant of the driver: no timing, no model-file output.
    """
    # import nltk.corpus
    import vocabulary_for_mglda as vocabulary
    corpus = vocabulary.load_corpus_each_sentence("0:2000")
    # docs[sentence_idx][word_idx]
    voca = vocabulary.Vocabulary(True)
    docs = [voca.doc_to_ids_each_sentence(doc) for doc in corpus]
    # Hyperparameters, named individually instead of the original opaque
    # 12-target tuple assignment (which also redundantly reassigned `docs`
    # to itself).
    K_gl = 50            # number of global topics
    K_loc = 10           # number of local topics
    gamma = 0.1
    alpha_gl = 0.1
    alpha_loc = 0.1
    alpha_mix_gl = 0.1
    alpha_mix_loc = 0.1
    beta_gl = 0.1
    beta_loc = 0.1
    T = 3                # NOTE(review): presumably the sliding-window count — confirm in MGLDA
    W = voca.size()      # vocabulary size
    mglda = MGLDA(K_gl, K_loc, gamma, alpha_gl, alpha_loc, alpha_mix_gl,
                  alpha_mix_loc, beta_gl, beta_loc, T, docs, W)
    # Single pre-formatted argument: identical output under Python 2's print
    # statement and Python 3's print() function (the original Py2-only
    # `print "..."` is a SyntaxError on Python 3).
    print("corpus=%d, words=%d, K_gl=%d, K_loc=%d, gamma=%f, alpha_gl=%f, alpha_loc=%f, alpha_mix_gl=%f, alpha_mix_loc=%f, beta_gl=%f, beta_loc=%f"
          % (len(corpus), len(voca.vocas), K_gl, K_loc, gamma, alpha_gl,
             alpha_loc, alpha_mix_gl, alpha_mix_loc, beta_gl, beta_loc))
    iteration = 1000
    mglda_learning(mglda, iteration, voca)
def test():
    """Build an MGLDA model over the corpus slice "0:2000", print the run
    configuration, and run learning.

    Near-duplicate of the preceding driver variant; differs only in formatting.
    """
    # import nltk.corpus
    import vocabulary_for_mglda as vocabulary
    corpus = vocabulary.load_corpus_each_sentence("0:2000")
    voca = vocabulary.Vocabulary(True)
    # docs[sentence_idx][word_idx]
    docs = []
    for doc in corpus:
        docs.append(voca.doc_to_ids_each_sentence(doc))
    K_gl, K_loc = 50, 10
    gamma = 0.1
    alpha_gl, alpha_loc = 0.1, 0.1
    alpha_mix_gl, alpha_mix_loc = 0.1, 0.1
    beta_gl, beta_loc = 0.1, 0.1
    T = 3
    W = voca.size()
    mglda = MGLDA(K_gl, K_loc, gamma, alpha_gl, alpha_loc,
                  alpha_mix_gl, alpha_mix_loc, beta_gl, beta_loc,
                  T, docs, W)
    # Parenthesized single argument keeps this line valid (and the output
    # byte-identical) on both the Py2 print statement and the Py3 function.
    summary = ("corpus=%d, words=%d, K_gl=%d, K_loc=%d, gamma=%f, alpha_gl=%f, alpha_loc=%f, alpha_mix_gl=%f, alpha_mix_loc=%f, beta_gl=%f, beta_loc=%f"
               % (len(corpus), len(voca.vocas), K_gl, K_loc, gamma,
                  alpha_gl, alpha_loc, alpha_mix_gl, alpha_mix_loc,
                  beta_gl, beta_loc))
    print(summary)
    iteration = 1000
    mglda_learning(mglda, iteration, voca)