import sys
import os
sys.path.append("/n/home13/jbischof/word_rate_prj/wordratemodels/exec/")
from parse_lda_data import *

# Fake-data run: every input and output file lives in the "fake_data/"
# folder underneath the main output directory.
main_dir = "/n/airoldifs2/lab/jbischof/word_rate_output/"
fake_data_dir = main_dir + "fake_data/"

# Input file (LDA format)
infilename_lda = fake_data_dir + "fake_data_ldaformat.txt"

# Plain-text table outputs
outfilename_doclength = fake_data_dir + "doc_length_table.txt"
outfilename_margwc = fake_data_dir + "marg_wc_table.txt"

# .RData outputs (outfilename_doctopic is defined here but is not passed to
# the parse_lda_data call below)
outfilename_doctopic = fake_data_dir + "doc_topic_list.RData"
outfilename_docwc = fake_data_dir + "doc_word_count_list.RData"
outfilename_featwc = fake_data_dir + "feature_word_count_list.RData"

## Parse LDA data into format that R can read
parse_lda_data(
    infilename_lda,
    outfilename_doclength,
    outfilename_margwc,
    outfilename_docwc,
    outfilename_featwc,
    docwc_dictname="doc.count.list",
    featwc_dictname="feature.count.list",
    doc_ids_inc=True,
    items_skip=0
)
# Positional command-line arguments, in order:
#   1. partition -- the partition of the dataset (e.g., train, valid, test)
#   2. cutoff    -- the frequency cutoff for the word candidates (how many
#                   times a word must be seen in the entire corpus to be
#                   considered for feature selection)
#   3. main_dir  -- directory that contains "mmm_raw_data/"; it is joined by
#                   plain string concatenation, so it must end with "/"
#                   (previously hard-coded as
#                   "/n/airoldifs2/lab/jbischof/word_rate_output/")
partition, cutoff, main_dir = sys.argv[1], sys.argv[2], sys.argv[3]

# Raw data directory and the per-partition / per-cutoff output directory
data_dir = main_dir + "mmm_raw_data/"
out_dir = data_dir + "parsed_" + str(partition) + "_data" + str(cutoff) + "/"

# Inputs
infilename_lda = data_dir + "reuters_" + str(partition) + "_ldaformat.txt"
infilename_word_candidates = (
    data_dir + "reuters_mmm_kept_word_ids" + str(cutoff) + ".txt"
)

# Plain-text outputs
outfilename_doclength = out_dir + "doc_length_table.txt"
outfilename_eta = out_dir + "eta_vec.txt"

# .RData outputs
outfilename_doctopic = out_dir + "doc_topic_list.RData"
outfilename_docwc = out_dir + "doc_word_count_list.RData"
outfilename_featwc = out_dir + "feature_word_count_list.RData"
outfilename_docxi = out_dir + "doc_xi_list.RData"

# Earlier form of the call, without the doc-xi and eta outputs, kept for
# reference:
#parse_lda_data(infilename_lda,outfilename_doctopic,outfilename_doclength, \
#infilename_word_candidates, \
#outfilename_docwc,outfilename_featwc,doctopic_dictname="doc.topic.list", \
#docwc_dictname="doc.count.list",featwc_dictname="feature.count.list")

parse_lda_data(
    infilename_lda,
    outfilename_doctopic,
    outfilename_doclength,
    infilename_word_candidates,
    outfilename_docwc,
    outfilename_featwc,
    outfilename_docxi,
    outfilename_eta,
    doctopic_dictname="doc.topic.list",
    docwc_dictname="doc.count.list",
    featwc_dictname="feature.count.list",
    docxi_dictname="doc.xi.list"
)
# Inputs for parameter initialization: the selected-feature file is the same
# file as the word-candidate list.
infilename_selfeat = infilename_word_candidates
infilename_tab = "mmm_topic_address_book_py.txt"

# Output files for the initialized parameters
outfilename_mu = out_dir + "initialized_mu.txt"
outfilename_mu_corpus = out_dir + "initialized_corpus_mu.txt"
outfilename_tau2 = out_dir + "initialized_tau2.txt"
outfilename_theta = out_dir + "initialized_theta.txt"

## Parse LDA data into format that R can read
# NOTE(review): this repeats, with identical arguments, the parse_lda_data
# call made earlier in this file -- confirm whether both are needed.
parse_lda_data(
    infilename_lda,
    outfilename_doctopic,
    outfilename_doclength,
    infilename_word_candidates,
    outfilename_docwc,
    outfilename_featwc,
    outfilename_docxi,
    outfilename_eta,
    doctopic_dictname="doc.topic.list",
    docwc_dictname="doc.count.list",
    featwc_dictname="feature.count.list",
    docxi_dictname="doc.xi.list"
)

# Parameter initialization only makes sense for training set
if partition == "train":
    # Initialize tree parameters for all sets of data
    initialize_mmm_params(
        infilename_selfeat,
        infilename_tab,
        infilename_lda,
        outfilename_mu,
        outfilename_mu_corpus,
        outfilename_tau2,
        L=1
    )

    # Process theta
    initialize_theta_params(infilename_lda, infilename_tab, outfilename_theta)

    # Get theta in sparse format
    # NOTE(review): the Rscript command line is only assembled here; nothing
    # in the visible code executes cmd5 -- confirm where it is run.
    source5 = "../mmm_process_functions/get_sparse_theta.R"
    cmd5 = "Rscript " + source5 + " " + out_dir
# Directory layout for the AP run
out_dir = "/n/airoldifs2/lab/jbischof/word_rate_output/"
data_dir = out_dir + "data/"
src_dir = "../exec/"
mod_file = src_dir + "parse_lda_data.py"

# Put src_dir on the module search path before importing the parser from it
import sys
sys.path.append(src_dir)
from parse_lda_data import *

# Output files, then the input file
outfilename_doclength = data_dir + "ap_doclengths.txt"
outfilename_ragarray = data_dir + "ap_ragarray.txt"
outfilename_margwc = data_dir + "ap_margwc.txt"
infilename_lda = data_dir + "ap_ldaformat.txt"

# NOTE(review): this call passes four positional arguments, unlike the other
# parse_lda_data calls in this file -- confirm which signature is in effect.
parse_lda_data(
    infilename_lda,
    outfilename_doclength,
    outfilename_ragarray,
    outfilename_margwc
)