def main():
    """Train an ML-FW model from the command line and write the results.

    Expected invocation::

        python run_ML_FW.py [train file] [setting file] [model folder] [test data folder]

    Steps, in order:

    1. Validate the argument count; on mismatch print the usage line and
       exit with status 1.
    2. Delete the model folder if it already exists, then recreate it empty.
    3. Read the settings, and write a copy to ``<model folder>/setting.txt``.
    4. Read the held-out data used for computing perplexities.
    5. Run ``iter_train`` passes over the training file, minibatch by
       minibatch, writing intermediate results after every minibatch.
    6. Write the final ``beta`` to ``<model folder>/beta_final.dat``.
    """
    # Check input
    if len(sys.argv) != 5:
        print("usage: python run_ML_FW.py [train file] [setting file] [model folder] [test data folder]")
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    # Get command-line arguments
    train_file, setting_file, model_folder, test_data_folder = sys.argv[1:5]
    tops = 10  # passed to utilities.list_top / utilities.write_file

    # Start from an empty model folder: any previous content is removed.
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)

    # Read settings
    print('reading setting ...')
    ddict = utilities.read_setting(setting_file)
    print('write setting ...')
    file_name = '%s/setting.txt' % (model_folder)
    utilities.write_setting(ddict, file_name)

    # Read data for computing perplexities
    print('read data for computing perplexities ...')
    (wordids_1, wordcts_1, wordids_2, wordcts_2) = \
        utilities.read_data_for_perpl(test_data_folder)

    # Initialize the algorithm
    print('initialize the algorithm ...')
    ml_fw = ML_FW.MLFW(ddict['num_terms'], ddict['num_topics'], ddict['tau0'],
                       ddict['kappa'], ddict['iter_infer'])

    # Start
    print('start!!!')
    i = 0
    while i < ddict['iter_train']:
        i += 1
        print('\n***iter_train:%d***\n' % (i))
        # The context manager closes the training file even if a step raises.
        with open(train_file, 'r') as datafp:
            j = 0
            while True:
                j += 1
                (wordids, wordcts) = utilities.read_minibatch_list_frequencies(
                    datafp, ddict['batch_size'])
                # Stop condition: an empty minibatch ends this pass
                if len(wordids) == 0:
                    break
                print('---num_minibatch:%d---' % (j))
                (time_e, time_m, theta) = ml_fw.static_online(
                    ddict['batch_size'], wordids, wordcts)
                # Compute sparsity
                sparsity = utilities.compute_sparsity(
                    theta, theta.shape[0], theta.shape[1], 't')
                # Compute perplexities
                LD2 = utilities.compute_perplexities_fw(
                    ml_fw.beta, ddict['iter_infer'],
                    wordids_1, wordcts_1, wordids_2, wordcts_2)
                # Search top words of each topic
                list_tops = utilities.list_top(ml_fw.beta, tops)
                # Write files
                utilities.write_file(i, j, ml_fw.beta, time_e, time_m, theta,
                                     sparsity, LD2, list_tops, tops, model_folder)

    # Write final model to file
    file_name = '%s/beta_final.dat' % (model_folder)
    utilities.write_topics(ml_fw.beta, file_name)

    # Finish
    print('done!!!')
def main():
    """Run one of the OPE training methods selected on the command line.

    Expected invocation::

        python run.py [method name] [train file] [setting file] [model folder] [test data folder]

    The method name is matched case-insensitively against the supported
    list. It is validated *before* anything on disk is touched, so a
    mistyped name no longer deletes an existing model folder. On a wrong
    argument count or an unknown method the function prints a message and
    exits with status 1.

    For a valid method the model folder is deleted if it exists and
    recreated empty, the settings and the perplexity data are read, and the
    matching runner object is constructed and its ``run()`` method called.
    """
    # Check input
    if len(sys.argv) != 6:
        print("usage: python run.py [method name] [train file] [setting file] [model folder] [test data folder]")
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    # Get command-line arguments
    (method_name, train_file, setting_file,
     model_folder, test_data_folder) = sys.argv[1:6]
    tops = 10  # forwarded unchanged to the selected runner

    # Validate the method first: everything below is destructive or slow.
    methods = [
        "new1ml-ope",
        "new2ml-ope",
        "new1online-ope",
        "new2online-ope",
        "new1streaming-ope",
        "new2streaming-ope",
    ]
    method_low = method_name.lower()
    if method_low not in methods:
        print("\ninput wrong method name: %s\n" % (method_name))
        print("list of methods:")
        for method in methods:
            print("\t\t%s" % (method))
        sys.exit(1)

    # Start from an empty model folder: any previous content is removed.
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)

    # Read settings
    print("reading setting ...")
    settings = utilities.read_setting(setting_file)

    # Read data for computing perplexities
    print("read data for computing perplexities ...")
    test_data = utilities.read_data_for_perpl(test_data_folder)

    # Build the runner for the chosen method; every runner takes the same
    # five arguments.
    if method_low == "new1ml-ope":
        runner = run_New1ML_OPE.runNew1MLOPE(
            train_file, settings, model_folder, test_data, tops)
    elif method_low == "new2ml-ope":
        runner = run_New2ML_OPE.runNew2MLOPE(
            train_file, settings, model_folder, test_data, tops)
    elif method_low == "new1online-ope":
        runner = run_New1Online_OPE.runNew1OnlineOPE(
            train_file, settings, model_folder, test_data, tops)
    elif method_low == "new2online-ope":
        runner = run_New2Online_OPE.runNew2OnlineOPE(
            train_file, settings, model_folder, test_data, tops)
    elif method_low == "new1streaming-ope":
        runner = run_New1Streaming_OPE.runNew1StreamingOPE(
            train_file, settings, model_folder, test_data, tops)
    else:  # "new2streaming-ope", the only remaining validated name
        runner = run_New2Streaming_OPE.runNew2StreamingOPE(
            train_file, settings, model_folder, test_data, tops)
    runner.run()
def main():
    """Dispatch to the OPE training runner named on the command line.

    Expected invocation::

        python run.py [method name] [train file] [setting file] [model folder] [test data folder]

    The method name is compared in lower case with the supported names.
    The check happens before the model folder is cleared, so an unknown
    name leaves existing output untouched. A wrong argument count or an
    unknown method prints a message and exits with status 1.

    With a valid method the function clears and recreates the model folder,
    reads the settings and the perplexity data, constructs the matching
    runner and calls its ``run()`` method.
    """
    # Check input
    if len(sys.argv) != 6:
        print("usage: python run.py [method name] [train file] [setting file] [model folder] [test data folder]")
        # Signal the usage error to the calling shell.
        sys.exit(1)

    # Get command-line arguments
    method_name = sys.argv[1]
    train_file = sys.argv[2]
    setting_file = sys.argv[3]
    model_folder = sys.argv[4]
    test_data_folder = sys.argv[5]
    tops = 10  # forwarded unchanged to the selected runner

    # Check the method before doing anything destructive
    methods = ['new1ml-ope', 'new2ml-ope', 'new1online-ope', 'new2online-ope',
               'new1streaming-ope', 'new2streaming-ope']
    method_low = method_name.lower()
    if method_low not in methods:
        print('\ninput wrong method name: %s\n' % (method_name))
        print('list of methods:')
        for method in methods:
            print('\t\t%s' % (method))
        sys.exit(1)

    # Start from an empty model folder: any previous content is removed.
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)

    # Read settings
    print('reading setting ...')
    settings = utilities.read_setting(setting_file)

    # Read data for computing perplexities
    print('read data for computing perplexities ...')
    test_data = utilities.read_data_for_perpl(test_data_folder)

    # All runners share one constructor signature, so collect the arguments once.
    runner_args = (train_file, settings, model_folder, test_data, tops)
    if method_low == 'new1ml-ope':
        runner = run_New1ML_OPE.runNew1MLOPE(*runner_args)
    elif method_low == 'new2ml-ope':
        runner = run_New2ML_OPE.runNew2MLOPE(*runner_args)
    elif method_low == 'new1online-ope':
        runner = run_New1Online_OPE.runNew1OnlineOPE(*runner_args)
    elif method_low == 'new2online-ope':
        runner = run_New2Online_OPE.runNew2OnlineOPE(*runner_args)
    elif method_low == 'new1streaming-ope':
        runner = run_New1Streaming_OPE.runNew1StreamingOPE(*runner_args)
    else:  # 'new2streaming-ope', the only remaining validated name
        runner = run_New2Streaming_OPE.runNew2StreamingOPE(*runner_args)
    runner.run()
def main():
    """Train an Online-FW model from the command line and write the results.

    Expected invocation::

        python run_Online_FW.py [train file] [setting file] [model folder] [test data folder]

    Steps, in order:

    1. Validate the argument count; on mismatch print the usage line and
       exit with status 1.
    2. Delete the model folder if it already exists, then recreate it empty.
    3. Read the settings, and write a copy to ``<model folder>/setting.txt``.
    4. Read the held-out data used for computing perplexities.
    5. Run ``iter_train`` passes over the training file, minibatch by
       minibatch, writing intermediate results after every minibatch.
    6. Write the final ``_lambda`` to ``<model folder>/lambda_final.dat``.
    """
    # Check input
    if len(sys.argv) != 5:
        print("usage: python run_Online_FW.py [train file] [setting file] [model folder] [test data folder]")
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    # Get command-line arguments
    train_file, setting_file, model_folder, test_data_folder = sys.argv[1:5]
    tops = 10  # passed to utilities.list_top / utilities.write_file

    # Start from an empty model folder: any previous content is removed.
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)

    # Read settings
    print("reading setting ...")
    ddict = utilities.read_setting(setting_file)
    print("write setting ...")
    file_name = "%s/setting.txt" % (model_folder)
    utilities.write_setting(ddict, file_name)

    # Read data for computing perplexities
    print("read data for computing perplexities ...")
    (wordids_1, wordcts_1, wordids_2, wordcts_2) = utilities.read_data_for_perpl(test_data_folder)

    # Initialize the algorithm
    print("initialize the algorithm ...")
    online_fw = Online_FW.OnlineFW(
        ddict["num_docs"],
        ddict["num_terms"],
        ddict["num_topics"],
        ddict["eta"],
        ddict["tau0"],
        ddict["kappa"],
        ddict["iter_infer"],
    )

    # Start
    print("start!!!")
    i = 0
    while i < ddict["iter_train"]:
        i += 1
        print("\n***iter_train:%d***\n" % (i))
        # The context manager closes the training file even if a step raises.
        with open(train_file, "r") as datafp:
            j = 0
            while True:
                j += 1
                (wordids, wordcts) = utilities.read_minibatch_list_frequencies(
                    datafp, ddict["batch_size"])
                # Stop condition: an empty minibatch ends this pass
                if len(wordids) == 0:
                    break
                print("---num_minibatch:%d---" % (j))
                (time_e, time_m, theta) = online_fw.static_online(
                    ddict["batch_size"], wordids, wordcts)
                # Compute sparsity
                sparsity = utilities.compute_sparsity(
                    theta, theta.shape[0], theta.shape[1], "t")
                # Compute perplexities
                LD2 = utilities.compute_perplexities_fw(
                    online_fw._lambda, ddict["iter_infer"],
                    wordids_1, wordcts_1, wordids_2, wordcts_2
                )
                # Search top words of each topic
                list_tops = utilities.list_top(online_fw._lambda, tops)
                # Write files
                utilities.write_file(
                    i, j, online_fw._lambda, time_e, time_m, theta,
                    sparsity, LD2, list_tops, tops, model_folder
                )

    # Write final model to file
    file_name = "%s/lambda_final.dat" % (model_folder)
    utilities.write_topics(online_fw._lambda, file_name)

    # Finish
    print("done!!!")
def main():
    """Train an ML-OPE model from the command line and write the results.

    Expected invocation::

        python run_ML_OPE.py [train file] [setting file] [model folder] [test data folder]

    Steps, in order:

    1. Validate the argument count; on mismatch print the usage line and
       exit with status 1.
    2. Delete the model folder if it already exists, then recreate it empty.
    3. Read the settings, and write a copy to ``<model folder>/setting.txt``.
    4. Read the held-out perplexity data (currently unused, because the
       perplexity computation below is disabled).
    5. Run ``iter_train`` passes over the training file, minibatch by
       minibatch; after each minibatch compare the new top-word lists with
       the previous ones and write intermediate results.
    6. Write the final ``beta`` to ``<model folder>/beta_final.dat``.
    """
    # Check input
    if len(sys.argv) != 5:
        print(
            "usage: python run_ML_OPE.py [train file] [setting file] [model folder] [test data folder]"
        )
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    # Get command-line arguments
    train_file, setting_file, model_folder, test_data_folder = sys.argv[1:5]

    # Start from an empty model folder: any previous content is removed.
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)

    # Read settings
    print('reading setting ...')
    ddict = utilities.read_setting(setting_file)
    print('write setting ...')
    file_name = '%s/setting.txt' % (model_folder)
    utilities.write_setting(ddict, file_name)

    # Read data for computing perplexities
    print('read data for computing perplexities ...')
    (wordids_1, wordcts_1, wordids_2, wordcts_2) = \
        utilities.read_data_for_perpl(test_data_folder)

    # Initialize the algorithm
    print('initialize the algorithm ...')
    ml_ope = ML_OPE.MLOPE(ddict['num_terms'], ddict['num_topics'], ddict['alpha'],
                          ddict['tau0'], ddict['kappa'], ddict['iter_infer'])

    # Start
    print('start!!!')
    i = 0
    # Top-word lists of the previous minibatch; empty before the first one.
    list_tops = []
    while i < ddict['iter_train']:
        i += 1
        print('\n***iter_train:%d***\n' % (i))
        # The context manager closes the training file even if a step raises.
        with open(train_file, 'r') as datafp:
            j = 0
            while True:
                j += 1
                (wordids, wordcts) = utilities.read_minibatch_list_frequencies(
                    datafp, ddict['batch_size'])
                # Stop condition: an empty minibatch ends this pass
                if len(wordids) == 0:
                    break
                print('---num_minibatch:%d---' % (j))
                (time_e, time_m, theta) = ml_ope.static_online(
                    ddict['batch_size'], wordids, wordcts)
                # Compute sparsity
                sparsity = utilities.compute_sparsity(
                    theta, theta.shape[0], theta.shape[1], 't')
                # Perplexity computation is disabled; None is written instead.
                # LD2 = utilities.compute_perplexities_vb(ml_ope.beta, ddict['alpha'], ddict['eta'], ddict['iter_infer'], \
                #                                         wordids_1, wordcts_1, wordids_2, wordcts_2)
                LD2 = None
                # Keep the previous list_tops for diff_list_tops() below
                prev_list_tops = list_tops
                # Search top words of each topic
                list_tops = utilities.list_top(ml_ope.beta, ddict['tops'])
                # TODO: add [last 25% avg diff count] to new file to compare later with other settings
                # Calculate and print difference between old and current list_tops
                utilities.diff_list_tops(list_tops, prev_list_tops, i)
                # Write files
                utilities.write_file(i, j, ml_ope.beta, time_e, time_m, theta,
                                     sparsity, LD2, list_tops, model_folder)

    # Write final model to file
    file_name = '%s/beta_final.dat' % (model_folder)
    utilities.write_topics(ml_ope.beta, file_name)

    # Finish
    print('done!!!')
def main():
    """Command-line entry point: train ML-FW and store the learned model.

    Expected invocation::

        python run_ML_FW.py [train file] [setting file] [model folder] [test data folder]

    The function exits with status 1 after printing the usage line when the
    argument count is wrong. Otherwise it clears and recreates the model
    folder, copies the settings into it as ``setting.txt``, reads the data
    used for computing perplexities, and makes ``iter_train`` passes over
    the training file. After each minibatch it computes sparsity,
    perplexities and top words and hands them to ``utilities.write_file``.
    The final ``beta`` is written to ``beta_final.dat`` in the model folder.
    """
    # Check input
    if len(sys.argv) != 5:
        print("usage: python run_ML_FW.py [train file] [setting file] [model folder] [test data folder]")
        # Signal the usage error to the calling shell.
        sys.exit(1)

    # Get command-line arguments
    train_file = sys.argv[1]
    setting_file = sys.argv[2]
    model_folder = sys.argv[3]
    test_data_folder = sys.argv[4]
    tops = 10  # passed to utilities.list_top / utilities.write_file

    # Start from an empty model folder: any previous content is removed.
    if os.path.exists(model_folder):
        shutil.rmtree(model_folder)
    os.makedirs(model_folder)

    # Read settings
    print('reading setting ...')
    ddict = utilities.read_setting(setting_file)
    print('write setting ...')
    file_name = '%s/setting.txt' % (model_folder)
    utilities.write_setting(ddict, file_name)

    # Read data for computing perplexities
    print('read data for computing perplexities ...')
    (wordids_1, wordcts_1, wordids_2, wordcts_2) = \
        utilities.read_data_for_perpl(test_data_folder)

    # Initialize the algorithm
    print('initialize the algorithm ...')
    ml_fw = ML_FW.MLFW(ddict['num_terms'], ddict['num_topics'], ddict['tau0'],
                       ddict['kappa'], ddict['iter_infer'])

    # Start
    print('start!!!')
    i = 0
    while i < ddict['iter_train']:
        i += 1
        print('\n***iter_train:%d***\n' % (i))
        # "with" guarantees the training file is closed if a step raises.
        with open(train_file, 'r') as datafp:
            j = 0
            while True:
                j += 1
                (wordids, wordcts) = utilities.read_minibatch_list_frequencies(
                    datafp, ddict['batch_size'])
                # Stop condition: an empty minibatch ends this pass
                if len(wordids) == 0:
                    break
                print('---num_minibatch:%d---' % (j))
                (time_e, time_m, theta) = ml_fw.static_online(
                    ddict['batch_size'], wordids, wordcts)
                # Compute sparsity
                sparsity = utilities.compute_sparsity(
                    theta, theta.shape[0], theta.shape[1], 't')
                # Compute perplexities
                LD2 = utilities.compute_perplexities_fw(
                    ml_fw.beta, ddict['iter_infer'],
                    wordids_1, wordcts_1, wordids_2, wordcts_2)
                # Search top words of each topic
                list_tops = utilities.list_top(ml_fw.beta, tops)
                # Write files
                utilities.write_file(i, j, ml_fw.beta, time_e, time_m, theta,
                                     sparsity, LD2, list_tops, tops, model_folder)

    # Write final model to file
    file_name = '%s/beta_final.dat' % (model_folder)
    utilities.write_topics(ml_fw.beta, file_name)

    # Finish
    print('done!!!')