def data_gen():
    """Build the ground-truth bar topics and write one data set per experiment.

    All settings are fixed inside the function.  The topics ``phi`` are
    ``T = 2*L`` arrays of shape ``(L, L)``: topic ``t < L`` has row ``t``
    filled with ``1/L``, and topic ``t >= L`` has column ``t - L`` filled
    with ``1/L``; every other entry is zero.

    Side effects: seeds NumPy's global RNG with 1234, calls
    ``rewrite_dir('data')`` and then ``write_data`` once per experiment
    index.  Returns ``None``.
    """
    ### IN
    ## experiment stuff
    np.random.seed(1234)
    data_dir = 'data'
    n_exp = 1  # number of experiments

    ## LDA stuff
    W = 25  # word vocabulary
    L = int(np.sqrt(W))  # image size
    T = 2 * L  # topics
    D = 1000  # documents
    N = 100  # words per document
    alpha = 1.  # hyper-param for mixture of topics (theta)
    beta = 1.  # hyper-param for topic distribs (phi), used only as param in pb

    # stan
    chains = 1

    # phi is given as the horizontal and vertical topics on the 5X5 images:
    # the first L topics are rows, the remaining L topics are columns.
    phi = [np.zeros((L, L)) for _ in range(T)]
    for line, phi_t in enumerate(phi):
        if line >= L:
            # Scalar assignment broadcasts over the whole column.
            phi_t[:, line - L] = 1. / L
        else:
            phi_t[line] = 1. / L

    rewrite_dir(data_dir)
    # Plain loop: write_data is called for its side effects only.
    for i in range(n_exp):
        write_data(data_dir, i, W, L, T, D, N, phi, alpha, beta, chains)
def data_gen():
    """Download the sushi3 order file and convert it with ``write_pb``.

    Steps, in order: ``rewrite_dir('data')``, run ``sh get_data.sh`` through
    ``call_cmd``, read ``data/sushi3/sushi3a.5000.10.order`` with
    ``read_data``, then hand the result to ``write_pb`` together with the
    fixed settings ``K = 6``, ``beta = 50 / K`` and ``gamma = 0.1``.
    Returns ``None``.
    """
    # Fixed model settings forwarded to write_pb.
    k = 6
    gamma = 0.1
    beta = 50. / k

    # Input produced by get_data.sh and the output file to create.
    order_path = 'data/sushi3/sushi3a.5000.10.order'
    pb_path = 'data/sushi3/sushi3.pb'

    # NOTE(review): presumably rewrite_dir resets the directory before the
    # download script repopulates it -- confirm against its definition.
    rewrite_dir('data')
    call_cmd('sh get_data.sh')

    count, records = read_data(order_path)
    write_pb(pb_path, count, records, k, beta, gamma)
def data_gen(seed, n_exp, W, L, T, D, N, alpha, beta):
    """Build the ground-truth bar topics, plot them and write the data sets.

    The topics ``phi`` are ``T`` arrays of shape ``(L, L)``: topic ``t < L``
    has row ``t`` filled with ``1/L``, and topic ``t >= L`` has column
    ``t - L`` filled with ``1/L``; every other entry is zero.

    Args:
        seed: value passed to ``np.random.seed``.
        n_exp: number of ``write_pb_txt`` calls (indices ``0 .. n_exp - 1``).
        W: forwarded unchanged to ``write_pb_txt`` (presumably the vocabulary
            size -- confirm against that function).
        L: side length of each square topic array.
        T: number of topics to build.  Expected to be at most ``2 * L``; a
            larger value makes the column index ``t - L`` fall outside the
            ``L x L`` array.
        D, N, alpha, beta: forwarded unchanged to ``write_pb_txt``.

    Side effects: seeds NumPy's global RNG, calls ``rewrite_dir('data')``,
    ``plot_topics`` and then ``write_pb_txt`` once per experiment index.
    Returns ``None``.
    """
    np.random.seed(seed)
    data_dir = 'data'

    # phi is given as the horizontal and vertical topics on the 5X5 images:
    # the first L topics are rows, the following topics are columns.
    phi = [np.zeros((L, L)) for _ in range(T)]
    for line, phi_t in enumerate(phi):
        if line >= L:
            # Scalar assignment broadcasts over the whole column.
            phi_t[:, line - L] = 1. / L
        else:
            phi_t[line] = 1. / L

    rewrite_dir(data_dir)
    plot_topics(T, phi, 'data/lda_ground_phi')
    # Plain loop: write_pb_txt is called for its side effects only.
    for i in range(n_exp):
        write_pb_txt(data_dir, i, W, L, T, D, N, phi, alpha, beta)