def run_online_hdp():
    """Run online variational inference for a supervised online HDP model.

    Reads options from the command line, trains on batches of documents
    (either streamed from a single file in ``seq_mode`` or sampled from
    on-disk splits), logs per-batch scores, periodically checkpoints the
    model, and finally evaluates predictions on held-out test data with
    per-response classification reports and a logistic-regression baseline.
    """
    # Command line options.
    options = parse_args()

    # NOTE(review): the original comment here said "Set the random seed",
    # but no seed is set in this variant, so runs are not reproducible --
    # confirm whether options carries a seed that should be applied.
    if options.seq_mode:
        # Stream documents sequentially from a single file.
        train_file = open(options.data_path)
    else:
        # Training corpus is sharded across several files; we rotate splits.
        train_filenames = glob(options.data_path)
        train_filenames.sort()
        num_train_splits = len(train_filenames)
        # Used to determine when we reload another split.
        # BUGFIX: floor division -- '/' yields a float in Python 3; the value
        # is only used in a comparison, so integer semantics are preserved.
        num_of_doc_each_split = options.D // num_train_splits
        # Pick a split to start with.
        # cur_chosen_split = int(random.random() * num_train_splits)
        cur_chosen_split = 0  # deterministic choice
        cur_train_filename = train_filenames[cur_chosen_split]
        c_train = read_data(cur_train_filename)

    if options.test_data_path is not None:
        test_data_path = options.test_data_path
        c_test = read_data(test_data_path)

    if options.test_data_path_in_folds is not None:
        # Fold files come in pairs after sorting: even indices are the test
        # halves, odd indices the train halves.
        test_data_path_in_folds = options.test_data_path_in_folds
        test_data_in_folds_filenames = glob(test_data_path_in_folds)
        test_data_in_folds_filenames.sort()
        # BUGFIX: floor division -- range() below requires an int in Python 3.
        num_folds = len(test_data_in_folds_filenames) // 2
        test_data_train_filenames = []
        test_data_test_filenames = []
        for i in range(num_folds):
            test_data_train_filenames.append(test_data_in_folds_filenames[2 * i + 1])
            test_data_test_filenames.append(test_data_in_folds_filenames[2 * i])

    result_directory = "%s/corpus-%s-kappa-%.1f-tau-%.f-batchsize-%d" % (
        options.directory, options.corpus_name, options.kappa,
        options.tau, options.batchsize)
    print("creating directory %s" % result_directory)
    if not os.path.isdir(result_directory):
        os.makedirs(result_directory)

    # Record the options used for this run.
    with open("%s/options.dat" % result_directory, "w") as options_file:
        for opt, value in options.__dict__.items():
            options_file.write(str(opt) + " " + str(value) + "\n")

    print("creating online hdp instance.")
    # One response label name per line.
    with open(options.responses) as responses_file:
        responses = responses_file.read().splitlines()
    ohdp = onlinehdp.online_hdp(responses, options.T, options.K, options.D,
                                options.W, options.eta, options.alpha,
                                options.gamma, options.kappa, options.tau,
                                options.scale, options.adding_noise)
    if options.new_init:
        ohdp.new_init(c_train)

    print("setting up counters and log files.")
    iter = 0
    save_lag_counter = 0
    total_time = 0.0
    total_doc_count = 0
    split_doc_count = 0

    log_file = open("%s/log.dat" % result_directory, "w")
    log_file.write("iteration time doc.count score word.count\n")

    if options.test_data_path is not None:
        # NOTE(review): only the header is ever written to this log in this
        # variant, and the handle is never closed -- confirm intent.
        test_log_file = open("%s/test-log.dat" % result_directory, "w")
        test_log_file.write("iteration time doc.count score word.count score.split word.count.split\n")

    print("starting online variational inference.")
    while True:
        iter += 1
        if iter % 10 == 1:
            print("iteration: %09d" % iter)
        # BUGFIX: time.clock() was removed in Python 3.8; perf_counter() is
        # the recommended replacement for interval timing.
        t0 = time.perf_counter()

        # Sample the documents.
        batchsize = options.batchsize
        if options.seq_mode:
            c = read_stream_data(train_file, batchsize)
            batchsize = c.num_docs
            if batchsize == 0:
                break
            docs = c.docs
        else:
            ids = random.sample(range(c_train.num_docs), batchsize)
            docs = [c_train.docs[id] for id in ids]

        total_doc_count += batchsize
        split_doc_count += batchsize

        # Do online inference and evaluate on the fly dataset.
        (score, count) = ohdp.process_documents(docs, options.var_converge)
        total_time += time.perf_counter() - t0
        log_file.write("%d %d %d %.5f %d\n" % (
            iter, total_time, total_doc_count, score, count))
        log_file.flush()

        # Checkpoint the model; the save lag doubles (up to 10 times) unless
        # a fixed lag was requested.
        if total_doc_count % options.save_lag == 0:
            if not options.fixed_lag and save_lag_counter < 10:
                save_lag_counter += 1
                options.save_lag = options.save_lag * 2

            # Save the model.
            ohdp.save_topics('%s/doc_count-%d.topics' % (result_directory, total_doc_count))
            # BUGFIX: close the checkpoint file deterministically.
            with open('%s/doc_count-%d.model' % (result_directory, total_doc_count), 'wb') as model_file:
                pickle.dump(ohdp, model_file)

        # Read another split once enough of the current one has been seen.
        if not options.seq_mode:
            if split_doc_count > num_of_doc_each_split * options.pass_ratio and num_train_splits > 1:
                print("Loading a new split from the training data")
                split_doc_count = 0
                # cur_chosen_split = int(random.random() * num_train_splits)
                cur_chosen_split = (cur_chosen_split + 1) % num_train_splits
                cur_train_filename = train_filenames[cur_chosen_split]
                c_train = read_data(cur_train_filename)

        if (options.max_iter != -1 and iter >= options.max_iter) or \
                (options.max_time != -1 and total_time > options.max_time):
            break

    log_file.close()

    print("Saving the final model and topics.")
    ohdp.save_topics('%s/final.topics' % result_directory)
    with open('%s/final.model' % result_directory, 'wb') as model_file:
        pickle.dump(ohdp, model_file)
    if options.seq_mode:
        train_file.close()

    # Making final predictions.
    if options.test_data_path is not None:
        print("Making predictions.")
        labels_test = np.array([doc.ys for doc in c_test.docs])
        (_, preds, gammas_test) = ohdp.infer_only(c_test.docs)
        print("HDP")
        # Per-response classification quality of the HDP predictions.
        for i in range(ohdp.num_responses()):
            report = classification_report(labels_test[:, i], preds[:, i])
            confusion = confusion_matrix(labels_test[:, i], preds[:, i])
            accuracy = accuracy_score(labels_test[:, i], preds[:, i])
            print("Accuracy rate : %f" % accuracy)
            print(report)
            print(confusion)
        hamming_accuracy = 1 - hamming_loss(labels_test, preds)
        print("Hamming accuracy : %f" % hamming_accuracy)

        # Baseline: logistic regression on the inferred topic proportions.
        labels_train = np.array([doc.ys for doc in c_train.docs])
        (_, _, gammas_train) = ohdp.infer_only(c_train.docs)
        print("Logistic Regression")
        for i in range(ohdp.num_responses()):
            clf = LogisticRegression()
            clf.fit(gammas_train, labels_train[:, i])
            preds = clf.predict(gammas_test)
            report = classification_report(labels_test[:, i], preds)
            confusion = confusion_matrix(labels_test[:, i], preds)
            accuracy = accuracy_score(labels_test[:, i], preds)
            print("Accuracy rate : %f" % accuracy)
            print(report)
            print(confusion)
def run_online_hdp():
    """Train an online HDP topic model with online variational inference.

    (Python 2 variant: uses the ``file`` builtin, ``print`` statements and
    ``cPickle``.)  Reads options from the command line, consumes training
    documents either as a sequential stream or by sampling from sharded
    split files, tracks seen documents so unseen-document scores can be
    logged separately, periodically checkpoints the model, and evaluates
    held-out likelihood on fixed test data.
    """
    # Command line options.
    options = parse_args()

    # Set the random seed.
    random.seed(options.random_seed)

    if options.seq_mode:
        # Stream documents sequentially from a single file.
        train_file = file(options.data_path)
    else:
        # Training corpus is sharded across several files.
        train_filenames = glob(options.data_path)
        train_filenames.sort()
        num_train_splits = len(train_filenames)
        # This is used to determine when we reload some another split.
        num_of_doc_each_split = options.D/num_train_splits
        # Pick a random split to start
        # cur_chosen_split = int(random.random() * num_train_splits)
        cur_chosen_split = 0 # deterministic choice
        cur_train_filename = train_filenames[cur_chosen_split]
        c_train = read_data(cur_train_filename)

    if options.test_data_path is not None:
        test_data_path = options.test_data_path
        c_test = read_data(test_data_path)
        # Total token count; the fixed denominator column in the test log.
        c_test_word_count = sum([doc.total for doc in c_test.docs])

    if options.test_data_path_in_folds is not None:
        # Fold files come in pairs after sorting: even indices are the test
        # halves, odd indices the train halves.
        test_data_path_in_folds = options.test_data_path_in_folds
        test_data_in_folds_filenames = glob(test_data_path_in_folds)
        test_data_in_folds_filenames.sort()
        num_folds = len(test_data_in_folds_filenames)/2
        test_data_train_filenames = []
        test_data_test_filenames = []
        for i in range(num_folds):
            test_data_train_filenames.append(test_data_in_folds_filenames[2*i+1])
            test_data_test_filenames.append(test_data_in_folds_filenames[2*i])
        c_test_train_folds = [read_data(filename) for filename in test_data_train_filenames]
        c_test_test_folds = [read_data(filename) for filename in test_data_test_filenames]

    result_directory = "%s/corpus-%s-kappa-%.1f-tau-%.f-batchsize-%d" % (
        options.directory, options.corpus_name, options.kappa,
        options.tau, options.batchsize)
    print "creating directory %s" % result_directory
    if not os.path.isdir(result_directory):
        os.makedirs(result_directory)

    # Record the options used for this run.
    options_file = file("%s/options.dat" % result_directory, "w")
    for opt, value in options.__dict__.items():
        options_file.write(str(opt) + " " + str(value) + "\n")
    options_file.close()

    print "creating online hdp instance."
    ohdp = onlinehdp.online_hdp(options.T, options.K, options.D, options.W,
                                options.eta, options.alpha, options.gamma,
                                options.kappa, options.tau, options.scale,
                                options.adding_noise)
    if options.new_init:
        ohdp.new_init(c_train)

    print "setting up counters and log files."
    iter = 0
    save_lag_counter = 0
    total_time = 0.0
    total_doc_count = 0
    split_doc_count = 0
    # (split_index, doc_index) pairs already processed at least once; lets us
    # report scores on never-before-seen documents separately.
    doc_seen = set()

    log_file = file("%s/log.dat" % result_directory, "w")
    log_file.write("iteration time doc.count score word.count unseen.score unseen.word.count\n")

    if options.test_data_path is not None:
        test_log_file = file("%s/test-log.dat" % result_directory, "w")
        test_log_file.write("iteration time doc.count score word.count score.split word.count.split\n")

    print "starting online variational inference."
    while True:
        iter += 1
        if iter % 1000 == 1:
            print "iteration: %09d" % iter
        t0 = time.clock()

        # Sample the documents.
        batchsize = options.batchsize
        if options.seq_mode:
            c = read_stream_data(train_file, batchsize)
            batchsize = c.num_docs
            if batchsize == 0:
                break
            docs = c.docs
            # In stream mode every document is new.
            unseen_ids = range(batchsize)
        else:
            ids = random.sample(range(c_train.num_docs), batchsize)
            docs = [c_train.docs[id] for id in ids]
            # Record the seen docs.
            unseen_ids = set([i for (i, id) in enumerate(ids)
                              if (cur_chosen_split, id) not in doc_seen])
            if len(unseen_ids) != 0:
                doc_seen.update([(cur_chosen_split, id) for id in ids])

        total_doc_count += batchsize
        split_doc_count += batchsize

        # Do online inference and evaluate on the fly dataset
        (score, count, unseen_score, unseen_count) = ohdp.process_documents(
            docs, options.var_converge, unseen_ids)
        total_time += time.clock() - t0
        log_file.write("%d %d %d %.5f %d %.5f %d\n" % (
            iter, total_time, total_doc_count, score, count,
            unseen_score, unseen_count))
        log_file.flush()

        # Evaluate on the test data: fixed and folds
        if total_doc_count % options.save_lag == 0:
            if not options.fixed_lag and save_lag_counter < 10:
                save_lag_counter += 1
                options.save_lag = options.save_lag * 2

            # Save the model.
            ohdp.save_topics('%s/doc_count-%d.topics' % (result_directory, total_doc_count))
            cPickle.dump(ohdp, file('%s/doc_count-%d.model' % (result_directory, total_doc_count), 'w'), -1)

            if options.test_data_path is not None:
                print "\tworking on predictions."
                # Convert the HDP posterior to an LDA-style parameterization
                # for held-out likelihood evaluation.
                (lda_alpha, lda_beta) = ohdp.hdp_to_lda()

                # prediction on the fixed test in folds
                print "\tworking on fixed test data."
                test_score = 0.0
                test_score_split = 0.0
                c_test_word_count_split = 0
                for doc in c_test.docs:
                    (likelihood, gamma) = onlinehdp.lda_e_step(doc, lda_alpha, lda_beta)
                    test_score += likelihood
                    (likelihood, count, gamma) = onlinehdp.lda_e_step_split(doc, lda_alpha, lda_beta)
                    test_score_split += likelihood
                    c_test_word_count_split += count

                test_log_file.write("%d %d %d %.5f %d %.5f %d\n" % (
                    iter, total_time, total_doc_count, test_score,
                    c_test_word_count, test_score_split, c_test_word_count_split))
                test_log_file.flush()

        # read another split.
        if not options.seq_mode:
            if split_doc_count > num_of_doc_each_split * options.pass_ratio and num_train_splits > 1:
                print "Loading a new split from the training data"
                split_doc_count = 0
                # cur_chosen_split = int(random.random() * num_train_splits)
                cur_chosen_split = (cur_chosen_split + 1) % num_train_splits
                cur_train_filename = train_filenames[cur_chosen_split]
                c_train = read_data(cur_train_filename)

        if (options.max_iter != -1 and iter > options.max_iter) or (options.max_time !=-1 and total_time > options.max_time):
            break

    log_file.close()

    print "Saving the final model and topics."
    ohdp.save_topics('%s/final.topics' % result_directory)
    cPickle.dump(ohdp, file('%s/final.model' % result_directory, 'w'), -1)
    if options.seq_mode:
        train_file.close()

    # Making final predictions.
    if options.test_data_path is not None:
        (lda_alpha, lda_beta) = ohdp.hdp_to_lda()
        print "\tworking on fixed test data."
        test_score = 0.0
        test_score_split = 0.0
        c_test_word_count_split = 0
        for doc in c_test.docs:
            (likelihood, gamma) = onlinehdp.lda_e_step(doc, lda_alpha, lda_beta)
            test_score += likelihood
            (likelihood, count, gamma) = onlinehdp.lda_e_step_split(doc, lda_alpha, lda_beta)
            test_score_split += likelihood
            c_test_word_count_split += count

        test_log_file.write("%d %d %d %.5f %d %.5f %d\n" % (
            iter, total_time, total_doc_count, test_score,
            c_test_word_count, test_score_split, c_test_word_count_split))
        test_log_file.close()
def run_online_hdp():
    """Train an online HDP topic model with online variational inference.

    Reads options from the command line, consumes training documents either
    as a sequential stream or by sampling from sharded split files, tracks
    which documents have been seen so unseen-document scores can be logged
    separately, periodically checkpoints the model, and evaluates held-out
    likelihood on fixed test data.
    """
    # Command line options.
    options = parse_args()

    # Set the random seed.
    random.seed(options.random_seed)

    if options.seq_mode:
        # Stream documents sequentially from a single file.
        train_file = open(options.data_path)
    else:
        # Training corpus is sharded across several files.
        train_filenames = glob(options.data_path)
        train_filenames.sort()
        num_train_splits = len(train_filenames)
        # Used to determine when we reload another split.
        # BUGFIX: floor division -- '/' yields a float in Python 3; the value
        # is only used in a comparison, so integer semantics are preserved.
        num_of_doc_each_split = options.D // num_train_splits
        # Pick a split to start with.
        # cur_chosen_split = int(random.random() * num_train_splits)
        cur_chosen_split = 0  # deterministic choice
        cur_train_filename = train_filenames[cur_chosen_split]
        c_train = read_data(cur_train_filename)

    if options.test_data_path is not None:
        test_data_path = options.test_data_path
        c_test = read_data(test_data_path)
        # Total token count; the fixed denominator column in the test log.
        c_test_word_count = sum([doc.total for doc in c_test.docs])

    if options.test_data_path_in_folds is not None:
        # Fold files come in pairs after sorting: even indices are the test
        # halves, odd indices the train halves.
        test_data_path_in_folds = options.test_data_path_in_folds
        test_data_in_folds_filenames = glob(test_data_path_in_folds)
        test_data_in_folds_filenames.sort()
        # BUGFIX: floor division -- range() below requires an int in Python 3.
        num_folds = len(test_data_in_folds_filenames) // 2
        test_data_train_filenames = []
        test_data_test_filenames = []
        for i in range(num_folds):
            test_data_train_filenames.append(test_data_in_folds_filenames[2 * i + 1])
            test_data_test_filenames.append(test_data_in_folds_filenames[2 * i])
        c_test_train_folds = [read_data(filename) for filename in test_data_train_filenames]
        c_test_test_folds = [read_data(filename) for filename in test_data_test_filenames]

    result_directory = "%s/corpus-%s-kappa-%.1f-tau-%.f-batchsize-%d" % (
        options.directory, options.corpus_name, options.kappa,
        options.tau, options.batchsize)
    print("creating directory %s" % result_directory)
    if not os.path.isdir(result_directory):
        os.makedirs(result_directory)

    # Record the options used for this run.
    with open("%s/options.dat" % result_directory, "w") as options_file:
        for opt, value in options.__dict__.items():
            options_file.write(str(opt) + " " + str(value) + "\n")

    print("creating online hdp instance.")
    ohdp = onlinehdp.online_hdp(options.T, options.K, options.D, options.W,
                                options.eta, options.alpha, options.gamma,
                                options.kappa, options.tau, options.scale,
                                options.adding_noise)
    if options.new_init:
        ohdp.new_init(c_train)

    print("setting up counters and log files.")
    iter = 0
    save_lag_counter = 0
    total_time = 0.0
    total_doc_count = 0
    split_doc_count = 0
    # (split_index, doc_index) pairs already processed at least once; lets us
    # report scores on never-before-seen documents separately.
    doc_seen = set()

    log_file = open("%s/log.dat" % result_directory, "w")
    log_file.write("iteration time doc.count score word.count unseen.score unseen.word.count\n")

    if options.test_data_path is not None:
        test_log_file = open("%s/test-log.dat" % result_directory, "w")
        test_log_file.write("iteration time doc.count score word.count score.split word.count.split\n")

    print("starting online variational inference.")
    while True:
        iter += 1
        if iter % 1000 == 1:
            print("iteration: %09d" % iter)
        # BUGFIX: time.clock() was removed in Python 3.8; perf_counter() is
        # the recommended replacement for interval timing.
        t0 = time.perf_counter()

        # Sample the documents.
        batchsize = options.batchsize
        if options.seq_mode:
            c = read_stream_data(train_file, batchsize)
            batchsize = c.num_docs
            if batchsize == 0:
                break
            docs = c.docs
            # In stream mode every document is new.
            unseen_ids = range(batchsize)
        else:
            ids = random.sample(range(c_train.num_docs), batchsize)
            docs = [c_train.docs[id] for id in ids]
            # Record the seen docs.
            unseen_ids = set([i for (i, id) in enumerate(ids)
                              if (cur_chosen_split, id) not in doc_seen])
            if len(unseen_ids) != 0:
                doc_seen.update([(cur_chosen_split, id) for id in ids])

        total_doc_count += batchsize
        split_doc_count += batchsize

        # Do online inference and evaluate on the fly dataset.
        (score, count, unseen_score, unseen_count) = ohdp.process_documents(
            docs, options.var_converge, unseen_ids)
        total_time += time.perf_counter() - t0
        log_file.write("%d %d %d %.5f %d %.5f %d\n" % (
            iter, total_time, total_doc_count, score, count,
            unseen_score, unseen_count))
        log_file.flush()

        # Evaluate on the test data: fixed and folds.
        if total_doc_count % options.save_lag == 0:
            if not options.fixed_lag and save_lag_counter < 10:
                save_lag_counter += 1
                options.save_lag = options.save_lag * 2

            # Save the model.
            ohdp.save_topics('%s/doc_count-%d.topics' % (result_directory, total_doc_count))
            # BUGFIX: close the checkpoint file deterministically.
            with open('%s/doc_count-%d.model' % (result_directory, total_doc_count), 'wb') as model_file:
                pickle.dump(ohdp, model_file, -1)

            if options.test_data_path is not None:
                print("\tworking on predictions.")
                # Convert the HDP posterior to an LDA-style parameterization
                # for held-out likelihood evaluation.
                (lda_alpha, lda_beta) = ohdp.hdp_to_lda()

                # Prediction on the fixed test data.
                print("\tworking on fixed test data.")
                test_score = 0.0
                test_score_split = 0.0
                c_test_word_count_split = 0
                for doc in c_test.docs:
                    (likelihood, gamma) = onlinehdp.lda_e_step(doc, lda_alpha, lda_beta)
                    test_score += likelihood
                    (likelihood, count, gamma) = onlinehdp.lda_e_step_split(doc, lda_alpha, lda_beta)
                    test_score_split += likelihood
                    c_test_word_count_split += count

                test_log_file.write("%d %d %d %.5f %d %.5f %d\n" % (
                    iter, total_time, total_doc_count, test_score,
                    c_test_word_count, test_score_split, c_test_word_count_split))
                test_log_file.flush()

        # Read another split once enough of the current one has been seen.
        if not options.seq_mode:
            if split_doc_count > num_of_doc_each_split * options.pass_ratio and num_train_splits > 1:
                print("Loading a new split from the training data")
                split_doc_count = 0
                # cur_chosen_split = int(random.random() * num_train_splits)
                cur_chosen_split = (cur_chosen_split + 1) % num_train_splits
                cur_train_filename = train_filenames[cur_chosen_split]
                c_train = read_data(cur_train_filename)

        if (options.max_iter != -1 and iter > options.max_iter) or \
                (options.max_time != -1 and total_time > options.max_time):
            break

    log_file.close()

    print("Saving the final model and topics.")
    ohdp.save_topics('%s/final.topics' % result_directory)
    with open('%s/final.model' % result_directory, 'wb') as model_file:
        pickle.dump(ohdp, model_file, -1)
    if options.seq_mode:
        train_file.close()

    # Making final predictions.
    if options.test_data_path is not None:
        (lda_alpha, lda_beta) = ohdp.hdp_to_lda()
        print("\tworking on fixed test data.")
        test_score = 0.0
        test_score_split = 0.0
        c_test_word_count_split = 0
        for doc in c_test.docs:
            (likelihood, gamma) = onlinehdp.lda_e_step(doc, lda_alpha, lda_beta)
            test_score += likelihood
            (likelihood, count, gamma) = onlinehdp.lda_e_step_split(doc, lda_alpha, lda_beta)
            test_score_split += likelihood
            c_test_word_count_split += count

        test_log_file.write("%d %d %d %.5f %d %.5f %d\n" % (
            iter, total_time, total_doc_count, test_score,
            c_test_word_count, test_score_split, c_test_word_count_split))
        test_log_file.close()
def run_online_hdp():
    """Train an online HDP topic model with online variational inference.

    (Python 2 variant with ``getTime()``-timestamped logging.)  Supports
    resuming from a previously saved model (``init_model_path``), an
    optional time/location-augmented stream reader, per-batch score logging
    with separate unseen-document scores, periodic checkpoints, and
    held-out likelihood evaluation on fixed test data.
    """
    # Command line options.
    options = parse_args()

    # Set the random seed.
    random.seed(options.random_seed)

    if options.seq_mode:
        # Stream documents sequentially from a single file.
        train_file = file(options.data_path)
    else:
        # Training corpus is sharded across several files.
        train_filenames = glob(options.data_path)
        train_filenames.sort()
        num_train_splits = len(train_filenames)
        # This is used to determine when we reload some another split.
        num_of_doc_each_split = options.D / num_train_splits
        # Pick a random split to start
        # cur_chosen_split = int(random.random() * num_train_splits)
        cur_chosen_split = 0  # deterministic choice
        cur_train_filename = train_filenames[cur_chosen_split]
        c_train = read_data(cur_train_filename)

    if options.test_data_path is not None:
        test_data_path = options.test_data_path
        c_test = read_data(test_data_path)
        # Total token count; the fixed denominator column in the test log.
        c_test_word_count = sum([doc.total for doc in c_test.docs])

    if options.test_data_path_in_folds is not None:
        # Fold files come in pairs after sorting: even indices are the test
        # halves, odd indices the train halves.
        test_data_path_in_folds = options.test_data_path_in_folds
        test_data_in_folds_filenames = glob(test_data_path_in_folds)
        test_data_in_folds_filenames.sort()
        num_folds = len(test_data_in_folds_filenames) / 2
        test_data_train_filenames = []
        test_data_test_filenames = []
        for i in range(num_folds):
            test_data_train_filenames.append(
                test_data_in_folds_filenames[2 * i + 1])
            test_data_test_filenames.append(test_data_in_folds_filenames[2 * i])
        c_test_train_folds = [
            read_data(filename) for filename in test_data_train_filenames
        ]
        c_test_test_folds = [
            read_data(filename) for filename in test_data_test_filenames
        ]

    result_directory = "%s/corpus-%s-kappa-%.1f-tau-%.f-batchsize-%d" % (
        options.directory, options.corpus_name, options.kappa, options.tau,
        options.batchsize)
    print "%s creating directory %s" % (getTime(), result_directory)
    if not os.path.isdir(result_directory):
        os.makedirs(result_directory)

    # Record the options used for this run.
    options_file = file("%s/options.dat" % result_directory, "w")
    for opt, value in options.__dict__.items():
        options_file.write(str(opt) + " " + str(value) + "\n")
    options_file.close()

    if options.init_model_path is None:
        # Start from a fresh model and checkpoint its initial state.
        print "%s creating online hdp instance." % getTime()
        ohdp = onlinehdp.online_hdp(options.T, options.K, options.D, options.W,
                                    options.eta, options.alpha, options.gamma,
                                    options.kappa, options.tau, options.scale,
                                    options.adding_noise)
        cPickle.dump(ohdp, file('%s/init.model' % result_directory, 'wb'), -1)
    else:
        # Resume training from a previously pickled model.
        print "%s read online hdp instance from %s." % (
            getTime(), options.init_model_path)
        ohdp = cPickle.load(file(options.init_model_path, 'rb'))
    if options.new_init:
        ohdp.new_init(c_train)

    print "%s setting up counters and log files." % getTime()
    iter = 0
    save_lag_counter = 0
    total_time = 0.0
    total_doc_count = 0
    split_doc_count = 0
    # (split_index, doc_index) pairs already processed at least once; lets us
    # report scores on never-before-seen documents separately.
    doc_seen = set()

    log_file = file("%s/log.dat" % result_directory, "w")
    log_file.write(
        "iteration time doc.count score word.count unseen.score unseen.word.count\n"
    )

    if options.test_data_path is not None:
        test_log_file = file("%s/test-log.dat" % result_directory, "w")
        test_log_file.write(
            "iteration time doc.count score word.count score.split word.count.split\n"
        )

    print "%s starting online variational inference." % getTime()
    while True:
        iter += 1
        # The sampling guard is disabled: progress is printed every iteration.
        # if iter % 1000 == 1:
        print "%s iteration: %09d" % (getTime(), iter)
        t0 = time.clock()

        # Sample the documents.
        batchsize = options.batchsize
        if options.seq_mode:
            # Sequential pass over the whole training corpus.
            print "%s [iter %d] read batch data" % (getTime(), iter)
            if not options.add_time_location:
                c = read_stream_data(train_file, batchsize)  # plain read
            else:
                # Reader variant that also parses time/location fields.
                c = read_stream_data_time_location(train_file, batchsize)
            # Update batchsize to the number of docs actually read
            # (may be smaller than requested near the end of the stream).
            batchsize = c.num_docs
            if batchsize == 0:  # stop once the stream is exhausted
                break
            docs = c.docs
            unseen_ids = range(batchsize)
        else:
            # Training docs are split across files: sample randomly from the
            # corpus of the currently loaded split (c_train).
            ids = random.sample(range(c_train.num_docs),
                                batchsize)  # draw batchsize docs from c_train
            docs = [c_train.docs[id] for id in ids]
            # Record the seen docs.
            # Collect batch positions of docs not yet present in doc_seen.
            unseen_ids = set([
                i for (i, id) in enumerate(ids)
                if (cur_chosen_split, id) not in doc_seen
            ])
            if len(unseen_ids) != 0:
                doc_seen.update([
                    (cur_chosen_split, id) for id in ids
                ])  # if the batch contains unseen docs, mark them all as seen

        total_doc_count += batchsize
        split_doc_count += batchsize

        # Do online inference and evaluate on the fly dataset.
        # Previously sampled docs are not excluded -- they may be drawn again.
        print "%s [iter %d] processing %d docs." % (getTime(), iter,
                                                    total_doc_count)
        (score, count, unseen_score,
         unseen_count) = ohdp.process_documents(total_doc_count - batchsize,
                                                docs, options.var_converge,
                                                unseen_ids)
        total_time += time.clock() - t0
        log_file.write("%d %d %d %.5f %d %.5f %d\n" %
                       (iter, total_time, total_doc_count, score, count,
                        unseen_score, unseen_count))
        log_file.flush()

        # Evaluate on the test data: fixed and folds.
        # Note: the model is only saved when total_doc_count divides
        # save_lag exactly.
        if total_doc_count % options.save_lag == 0:
            if not options.fixed_lag and save_lag_counter < 10:
                save_lag_counter += 1
                options.save_lag = options.save_lag * 2

            # Save the model.
            print "%s save doc_count-%d.topics" % (getTime(), total_doc_count)
            ohdp.save_topics('%s/doc_count-%d.topics' %
                             (result_directory, total_doc_count))
            cPickle.dump(
                ohdp,
                file(
                    '%s/doc_count-%d.model' %
                    (result_directory, total_doc_count), 'wb'), -1)

            if options.test_data_path is not None:
                print "\t %s working on predictions." % getTime()
                # Convert the HDP posterior to an LDA-style parameterization
                # for held-out likelihood evaluation.
                (lda_alpha, lda_beta) = ohdp.hdp_to_lda()

                # prediction on the fixed test in folds
                print "\t %s working on fixed test data." % getTime()
                test_score = 0.0
                test_score_split = 0.0
                c_test_word_count_split = 0
                for doc in c_test.docs:
                    (likelihood, gamma) = onlinehdp.lda_e_step(
                        doc, lda_alpha, lda_beta)
                    test_score += likelihood
                    (likelihood, count, gamma) = onlinehdp.lda_e_step_split(
                        doc, lda_alpha, lda_beta)
                    test_score_split += likelihood
                    c_test_word_count_split += count

                test_log_file.write(
                    "%d %d %d %.5f %d %.5f %d\n" %
                    (iter, total_time, total_doc_count, test_score,
                     c_test_word_count, test_score_split,
                     c_test_word_count_split))
                test_log_file.flush()

        # read another split.
        if not options.seq_mode:
            if split_doc_count > num_of_doc_each_split * options.pass_ratio and num_train_splits > 1:
                print "%s Loading a new split from the training data" % getTime(
                )
                split_doc_count = 0
                # cur_chosen_split = int(random.random() * num_train_splits)
                cur_chosen_split = (cur_chosen_split + 1) % num_train_splits
                cur_train_filename = train_filenames[cur_chosen_split]
                c_train = read_data(cur_train_filename)

        # The iteration/time stopping rule is disabled: in seq_mode the loop
        # ends only when the stream is exhausted.
        # if (options.max_iter != -1 and iter > options.max_iter) or (
        #         options.max_time != -1 and total_time > options.max_time):
        #     break

    log_file.close()

    print "%s Saving the final model and topics." % getTime()
    ohdp.save_topics('%s/final.topics' % result_directory)
    cPickle.dump(ohdp, file('%s/final.model' % result_directory, 'wb'), -1)
    if options.seq_mode:
        train_file.close()

    # Making final predictions.
    if options.test_data_path is not None:
        (lda_alpha, lda_beta) = ohdp.hdp_to_lda()
        print "\t %s working on fixed test data." % getTime()
        test_score = 0.0
        test_score_split = 0.0
        c_test_word_count_split = 0
        for doc in c_test.docs:
            (likelihood, gamma) = onlinehdp.lda_e_step(doc, lda_alpha,
                                                       lda_beta)
            test_score += likelihood
            (likelihood, count, gamma) = onlinehdp.lda_e_step_split(
                doc, lda_alpha, lda_beta)
            test_score_split += likelihood
            c_test_word_count_split += count

        test_log_file.write(
            "%d %d %d %.5f %d %.5f %d\n" %
            (iter, total_time, total_doc_count, test_score, c_test_word_count,
             test_score_split, c_test_word_count_split))
        test_log_file.close()