# Driver script: run parse_lda_data on the fake data set so the results can be
# read from R.
import sys
import os

# The parsing module lives outside this directory; extend the search path
# before importing from it.
sys.path.append("/n/home13/jbischof/word_rate_prj/wordratemodels/exec/")
from parse_lda_data import *

# Directory layout
main_dir = "/n/airoldifs2/lab/jbischof/word_rate_output/"
fake_data_dir = main_dir + "fake_data/"

# Input file (LDA format)
infilename_lda = fake_data_dir + "fake_data_ldaformat.txt"

# Output files
outfilename_doclength = fake_data_dir + "doc_length_table.txt"
outfilename_margwc = fake_data_dir + "marg_wc_table.txt"
# NOTE: outfilename_doctopic is defined but not passed to the call below.
outfilename_doctopic = fake_data_dir + "doc_topic_list.RData"
outfilename_docwc = fake_data_dir + "doc_word_count_list.RData"
outfilename_featwc = fake_data_dir + "feature_word_count_list.RData"

## Parse LDA data into format that R can read
parse_lda_data(
    infilename_lda,
    outfilename_doclength,
    outfilename_margwc,
    outfilename_docwc,
    outfilename_featwc,
    docwc_dictname="doc.count.list",
    featwc_dictname="feature.count.list",
    doc_ids_inc=True,
    items_skip=0
)
# Parse one partition of the Reuters LDA-format data via parse_lda_data.
#
# Command line arguments:
#   partition  partition of the dataset (e.g., train, valid, test)
#   cutoff     frequency cutoff for the word candidates (how many times a word
#              needs to be seen in the entire corpus to be considered for
#              feature selection)
#   main_dir   base directory that contains "mmm_raw_data/"
#              (e.g., "/n/airoldifs2/lab/jbischof/word_rate_output/")
import os

# Fail with a usage message instead of a bare IndexError when arguments are
# missing.
if len(sys.argv) < 4:
    sys.exit("usage: " + sys.argv[0] + " <partition> <cutoff> <main_dir>")

# Command line arguments are already strings, so no str() conversion is needed.
partition = sys.argv[1]
cutoff = sys.argv[2]
main_dir = sys.argv[3]

# os.path.join inserts the separator only when it is missing, so main_dir works
# with or without a trailing "/". The trailing "" component keeps the final
# separator on directory paths, matching the previous "dir/" + name strings.
data_dir = os.path.join(main_dir, "mmm_raw_data", "")
out_dir = os.path.join(data_dir, "parsed_" + partition + "_data" + cutoff, "")

# Input files
infilename_lda = os.path.join(
    data_dir, "reuters_" + partition + "_ldaformat.txt")
infilename_word_candidates = os.path.join(
    data_dir, "reuters_mmm_kept_word_ids" + cutoff + ".txt")

# Output files
outfilename_doclength = os.path.join(out_dir, "doc_length_table.txt")
outfilename_doctopic = os.path.join(out_dir, "doc_topic_list.RData")
outfilename_docwc = os.path.join(out_dir, "doc_word_count_list.RData")
outfilename_featwc = os.path.join(out_dir, "feature_word_count_list.RData")
outfilename_docxi = os.path.join(out_dir, "doc_xi_list.RData")
outfilename_eta = os.path.join(out_dir, "eta_vec.txt")

parse_lda_data(
    infilename_lda,
    outfilename_doctopic,
    outfilename_doclength,
    infilename_word_candidates,
    outfilename_docwc,
    outfilename_featwc,
    outfilename_docxi,
    outfilename_eta,
    doctopic_dictname="doc.topic.list",
    docwc_dictname="doc.count.list",
    featwc_dictname="feature.count.list",
    docxi_dictname="doc.xi.list"
)
# Parse the LDA data and, for the training partition, initialize parameters.
# This section relies on names set up earlier in the script (out_dir,
# partition, infilename_lda, infilename_word_candidates and the outfilename_*
# paths passed to parse_lda_data).

# Input files
infilename_selfeat = infilename_word_candidates
infilename_tab = "mmm_topic_address_book_py.txt"

# Output files for the initialized parameters
outfilename_mu = out_dir + "initialized_mu.txt"
outfilename_mu_corpus = out_dir + "initialized_corpus_mu.txt"
outfilename_tau2 = out_dir + "initialized_tau2.txt"

# Output file for the processed thetas
outfilename_theta = out_dir + "initialized_theta.txt"

## Parse LDA data into format that R can read
parse_lda_data(
    infilename_lda,
    outfilename_doctopic,
    outfilename_doclength,
    infilename_word_candidates,
    outfilename_docwc,
    outfilename_featwc,
    outfilename_docxi,
    outfilename_eta,
    doctopic_dictname="doc.topic.list",
    docwc_dictname="doc.count.list",
    featwc_dictname="feature.count.list",
    docxi_dictname="doc.xi.list"
)

# Parameter initialization only makes sense for training set
# NOTE(review): the original indentation of this section was lost; everything
# below is assumed to belong to the training-only branch -- confirm.
if partition == "train":
    # Initialize tree parameters for all sets of data
    initialize_mmm_params(
        infilename_selfeat,
        infilename_tab,
        infilename_lda,
        outfilename_mu,
        outfilename_mu_corpus,
        outfilename_tau2,
        L=1
    )

    # Process theta
    initialize_theta_params(infilename_lda, infilename_tab, outfilename_theta)

    # Get theta in sparse format
    # NOTE(review): the command string is only assembled here; nothing in this
    # section executes it.
    source5 = "../mmm_process_functions/get_sparse_theta.R"
    cmd5 = "Rscript " + source5 + " " + out_dir
# Driver script: run parse_lda_data on the "ap_*" data files.
import sys

# Directory layout
out_dir = "/n/airoldifs2/lab/jbischof/word_rate_output/"
data_dir = out_dir + "data/"
src_dir = "../exec/"
# NOTE: mod_file is defined but not used by the rest of this script.
mod_file = src_dir + "parse_lda_data.py"

# The parsing module lives in src_dir; extend the search path before importing
# from it.
sys.path.append(src_dir)
from parse_lda_data import *

# Input file (LDA format)
infilename_lda = data_dir + "ap_ldaformat.txt"

# Output files
outfilename_doclength = data_dir + "ap_doclengths.txt"
outfilename_ragarray = data_dir + "ap_ragarray.txt"
outfilename_margwc = data_dir + "ap_margwc.txt"

parse_lda_data(
    infilename_lda,
    outfilename_doclength,
    outfilename_ragarray,
    outfilename_margwc
)