# Shared imports for the snippets below. `lstm`, `vae`, `stack` and
# `feedforward` are assumed to be project-local modules, as is `U`
# (utilities providing `vector_softmax` and `theano_rng`); `random_init`,
# `build_stmt_encoder`, `build_diag_encoder` and `build_lookup` are
# likewise assumed to be defined alongside this code.
import numpy as np
import theano
import theano.tensor as T


def build_sampler(P, character_count, embedding_size=20, hidden_size=50):
    # Character embeddings and initial states for a two-layer LSTM.
    P.V = np.random.randn(character_count, embedding_size)
    P.init_recurrent_1_hidden = 0.5 * np.random.randn(hidden_size)
    P.init_recurrent_1_cell = 0.5 * np.random.randn(hidden_size)
    P.init_recurrent_2_hidden = 0.5 * np.random.randn(hidden_size)
    P.init_recurrent_2_cell = 0.5 * np.random.randn(hidden_size)

    lstm_layer_1 = lstm.build_step(P, name="recurrent_1",
                                   input_size=embedding_size,
                                   hidden_size=hidden_size)
    lstm_layer_2 = lstm.build_step(P, name="recurrent_2",
                                   input_size=hidden_size,
                                   hidden_size=hidden_size)

    # Output projection from the top LSTM layer to character logits.
    P.W_output = np.zeros((hidden_size, character_count))
    P.b_output = np.zeros((character_count,))

    def sampler(temp, x, prev_cell_1, prev_hidden_1,
                prev_cell_2, prev_hidden_2):
        # One step of the stacked LSTM. `temp` multiplies the logits, so it
        # acts as an inverse temperature: larger values sharpen the
        # output distribution.
        input_embedding = P.V[x]
        cell_1, hidden_1 = lstm_layer_1(input_embedding,
                                        prev_cell_1, prev_hidden_1)
        cell_2, hidden_2 = lstm_layer_2(hidden_1, prev_cell_2, prev_hidden_2)
        output = U.vector_softmax(
            temp * (T.dot(hidden_2, P.W_output) + P.b_output))
        return output, cell_1, hidden_1, cell_2, hidden_2

    return sampler
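# Hypothetical usage sketch (not from the original source): compile the
# sampler into a theano.function and draw one character at a time. `P` is
# assumed to be a theano_toolkit-style Parameters container whose attribute
# assignments create shared variables.
def compile_sampler(P, sampler):
    temp = T.scalar('temp')
    x = T.iscalar('x')
    prev_cell_1 = T.vector('prev_cell_1')
    prev_hidden_1 = T.vector('prev_hidden_1')
    prev_cell_2 = T.vector('prev_cell_2')
    prev_hidden_2 = T.vector('prev_hidden_2')
    outputs = sampler(temp, x,
                      prev_cell_1, prev_hidden_1,
                      prev_cell_2, prev_hidden_2)
    return theano.function(
        inputs=[temp, x,
                prev_cell_1, prev_hidden_1,
                prev_cell_2, prev_hidden_2],
        outputs=list(outputs))

# dist, c1, h1, c2, h2 = compile_sampler(P, sampler)(1.0, seed_id,
#                                                    c1, h1, c2, h2)
# next_id = np.random.choice(len(dist), p=dist)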
def build_encoder(P, input_size, hidden_size, latent_size):
    P.init_encoder_hidden = np.zeros((hidden_size,))
    P.init_encoder_cell = np.zeros((hidden_size,))
    P.w_encoder_v = np.zeros((hidden_size,))
    P.b_encoder_v = 0
    rnn_step = lstm.build_step(P, name="encoder",
                               input_sizes=[input_size, latent_size],
                               hidden_size=hidden_size)
    gaussian_out = vae.build_encoder_output(P, name="encoder_gaussian",
                                            input_size=hidden_size,
                                            output_size=latent_size,
                                            initialise_weights=None)

    def encode(X, step_count):
        init_hidden = T.tanh(P.init_encoder_hidden)
        init_cell = P.init_encoder_cell
        init_hidden_batch = T.alloc(init_hidden, X.shape[0], hidden_size)
        init_cell_batch = T.alloc(init_cell, X.shape[0], hidden_size)
        init_latent = U.theano_rng.normal(size=(X.shape[0], latent_size))
        init_z_mean = T.zeros_like(init_latent)
        init_z_std = T.ones_like(init_latent)
        eps_seq = U.theano_rng.normal(
            size=(step_count, X.shape[0], latent_size))

        # The same input X is fed to the LSTM at every step, alongside the
        # previous latent sample. prev_z_mean / prev_z_std are threaded
        # through scan but unused inside the step. The call follows the
        # (..., prev_cell, prev_hidden) -> (cell, hidden) convention of
        # lstm.build_step used in the other snippets here.
        def step(eps, prev_latent, prev_hidden, prev_cell,
                 prev_z_mean, prev_z_std):
            cell, hidden = rnn_step(X, prev_latent, prev_cell, prev_hidden)
            _, z_mean, z_std = gaussian_out(hidden)
            # Reparameterisation trick: z = mu + eps * sigma.
            z_sample = z_mean + eps * z_std
            return z_sample, hidden, cell, z_mean, z_std

        [z_samples, hiddens, cells, z_means, z_stds], _ = theano.scan(
            step,
            sequences=[eps_seq],
            outputs_info=[init_latent, init_hidden_batch, init_cell_batch,
                          init_z_mean, init_z_std])
        # The +5 bias keeps the alphas large (around exp(5)) at the start
        # of training.
        alphas = T.exp(T.dot(hiddens, P.w_encoder_v) + P.b_encoder_v + 5)
        return z_samples, z_means, z_stds, alphas

    return encode
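# Hypothetical usage sketch: build the sequential encoder and compile it.
# The sizes and variable names are illustrative, not from the source.
X = T.matrix('X')  # (batch_size, input_size), fed to the LSTM at every step
encode = build_encoder(P, input_size=100, hidden_size=128, latent_size=20)
z_samples, z_means, z_stds, alphas = encode(X, step_count=10)
encode_fn = theano.function([X], [z_samples, z_means, z_stds, alphas])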
def build(P, word_rep_size, stmt_hidden_size, diag_hidden_size, vocab_size,
          output_size, map_fun_size, evidence_count):
    vocab_vectors = 0.001 * random_init(vocab_size, word_rep_size)
    P.vocab = vocab_vectors
    V = P.vocab

    # The statement encoder is shared with the question.
    encode_qstn = encode_stmt = build_stmt_encoder(P, "stmt",
                                                   word_rep_size,
                                                   stmt_hidden_size)
    # encode_qstn = build_stmt_encoder(P, "qstn", word_rep_size, diag_hidden_size)
    encode_diag = build_diag_encoder(P,
                                     stmt_size=stmt_hidden_size,
                                     hidden_size=diag_hidden_size,
                                     output_size=diag_hidden_size,
                                     encode_stmt=encode_stmt)
    qn2keys = lstm.build_step(P, "qn2keys",
                              input_size=diag_hidden_size,
                              hidden_size=diag_hidden_size)
    lookup_prep = build_lookup(P,
                               data_size=diag_hidden_size,
                               state_size=diag_hidden_size)
    # diag2output = feedforward.build(P, "diag2output",
    #                                 input_sizes=[diag_hidden_size],
    #                                 hidden_sizes=[map_fun_size],
    #                                 output_size=vocab_size)
    P.W_output_vocab = 0.01 * random_init(diag_hidden_size, vocab_size)
    P.b_output_vocab = np.zeros((vocab_size,))

    def qa(story, idxs, qstn):
        word_feats = V[story]
        qn_word_feats = V[qstn]
        diag_cells, diag_hiddens = encode_diag(word_feats, idxs)
        qn_cell, qn_hidden = encode_qstn(qn_word_feats)
        lookup = lookup_prep(diag_hiddens)

        attention = [None] * evidence_count
        evidence = [None] * evidence_count
        prev_cell, prev_hidden = qn_cell, qn_hidden
        prev_attn = 0
        input_vec = T.mean(diag_cells, axis=0)
        # Attend over the dialogue evidence_count times, feeding each
        # attended summary back in as the next key-generation input.
        for i in xrange(evidence_count):
            prev_cell, prev_hidden = qn2keys(input_vec, prev_cell,
                                             prev_hidden)
            attention[i] = lookup(prev_hidden, prev_attn)
            attention[i].name = "attention_%d" % i
            evidence[i] = input_vec = T.sum(
                attention[i].dimshuffle(0, 'x') * diag_cells, axis=0)
            prev_attn = prev_attn + attention[i]

        final_cell, final_hidden = prev_cell, prev_hidden
        output = U.vector_softmax(
            T.dot(final_hidden, P.W_output_vocab) + P.b_output_vocab)
        return attention, output

    return qa
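# Hypothetical usage sketch: wire the QA graph to symbolic inputs and
# compile it. `story` is assumed to be a flat vector of word ids with
# `idxs` marking statement boundaries; all sizes are illustrative.
story = T.ivector('story')
idxs = T.ivector('idxs')
qstn = T.ivector('qstn')
qa = build(P, word_rep_size=50, stmt_hidden_size=100, diag_hidden_size=100,
           vocab_size=10000, output_size=10000, map_fun_size=100,
           evidence_count=2)
attention, output = qa(story, idxs, qstn)
qa_fn = theano.function([story, idxs, qstn],
                        [T.argmax(output)] + attention)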
def build(P, input_size, embedding_size, controller_size, stack_size,
          output_size):
    # One extra output class (presumably a blank/no-op symbol) and two
    # extra input ids (presumably start/end markers).
    softmax_output_size = output_size + 1
    P.embeddings = np.random.randn(input_size + 2,
                                   embedding_size).astype(np.float64)
    controller_step = lstm.build_step(P, name="controller",
                                      input_size=embedding_size + stack_size,
                                      hidden_size=controller_size)
    stack_init = stack.build(size=stack_size)

    # Controller output layout: [softmax logits | push vector v | d | u].
    P.W_controller_output = 0.1 * np.random.randn(
        controller_size,
        softmax_output_size + stack_size + 1 + 1).astype(np.float64)
    bias = np.zeros((softmax_output_size + stack_size + 1 + 1,),
                    dtype=np.float64)
    bias[-2] = 5   # push strength d: sigmoid(5) ~ 1, push by default
    bias[-1] = -5  # pop strength u: sigmoid(-5) ~ 0, don't pop by default
    P.b_controller_output = bias

    init_controller_cell = np.zeros((controller_size,), dtype=np.float64)
    init_controller_hidden = np.zeros((controller_size,), dtype=np.float64)
    init_stack_r = np.zeros((stack_size,), dtype=np.float64)

    def predict(ids, aux=None):
        if aux is None:
            aux = {}
        X = P.embeddings[ids]
        init_stack_V, init_stack_s, stack_step = stack_init(X.shape[0])

        def step(x, t, prev_controller_cell, prev_controller_hidden,
                 prev_V, prev_s, prev_r):
            # The previous stack read vector is fed in with the input.
            controller_input = T.concatenate([x, prev_r])
            controller_cell, controller_hidden = controller_step(
                x=controller_input,
                prev_cell=prev_controller_cell,
                prev_hidden=prev_controller_hidden)
            controller_output = (
                T.dot(controller_hidden, P.W_controller_output)
                + P.b_controller_output)
            output = controller_output[:softmax_output_size]
            v = T.tanh(controller_output[softmax_output_size:
                                         softmax_output_size + stack_size])
            flags = T.nnet.sigmoid(controller_output[-2:])
            V, s, r = stack_step(t=t, v=v, d=flags[0], u=flags[1],
                                 prev_V=prev_V, prev_s=prev_s)
            return (controller_cell, controller_hidden, V, s, r,
                    controller_output, output)

        sequences, _ = theano.scan(
            step,
            sequences=[X, T.arange(X.shape[0])],
            outputs_info=[init_controller_cell, init_controller_hidden,
                          init_stack_V, init_stack_s, init_stack_r,
                          None, None])
        outputs = T.nnet.softmax(sequences[-1])
        aux['controller_output'] = sequences[-2]
        return outputs

    return predict
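# Hypothetical usage sketch: compile the stack-augmented controller on a
# sequence of symbol ids; sizes are illustrative. `aux` passes back the raw
# controller outputs for inspection.
ids = T.ivector('ids')
predict = build(P, input_size=128, embedding_size=32,
                controller_size=256, stack_size=64, output_size=128)
aux = {}
probs = predict(ids, aux)
predict_fn = theano.function([ids], [probs, aux['controller_output']])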
def build(P, name,
          input_size=200,
          z_size=200,
          hidden_layer_size=2500,
          x_extractor_layers=[600] * 4,
          z_extractor_layers=[500] * 4,
          prior_layers=[500] * 4,
          generation_layers=[600] * 4,
          inference_layers=[500] * 4):

    def weight_init(x, y):
        return np.random.uniform(-0.08, 0.08, (x, y))

    # Feature extractors for the observation x and the latent z.
    X_extractor = feedforward.build_classifier(
        P, "x_extractor",
        input_sizes=[input_size],
        hidden_sizes=x_extractor_layers[:-1],
        output_size=x_extractor_layers[-1],
        initial_weights=weight_init,
        output_initial_weights=weight_init,
        activation=T.nnet.relu,
        output_activation=T.nnet.relu)
    Z_extractor = feedforward.build_classifier(
        P, "z_extractor",
        input_sizes=[z_size],
        hidden_sizes=z_extractor_layers[:-1],
        output_size=z_extractor_layers[-1],
        initial_weights=weight_init,
        output_initial_weights=weight_init,
        activation=T.nnet.relu,
        output_activation=T.nnet.relu)

    # Prior p(z_t | h_{t-1}), decoder p(x_t | h_{t-1}, z_t) and approximate
    # posterior q(z_t | h_{t-1}, x_t); means and log-variances throughout.
    prior = vae.build_inferer(
        P, "prior",
        input_sizes=[hidden_layer_size],
        hidden_sizes=prior_layers,
        output_size=z_size,
        initial_weights=weight_init,
        activation=T.nnet.relu,
        initialise_outputs=False)
    generate = vae.build_inferer(
        P, "generator",
        input_sizes=[hidden_layer_size, z_extractor_layers[-1]],
        hidden_sizes=generation_layers,
        output_size=input_size,
        initial_weights=weight_init,
        activation=T.nnet.relu,
        initialise_outputs=False)

    P.init_recurrence_hidden = np.zeros((hidden_layer_size,))
    P.init_recurrence_cell = np.zeros((hidden_layer_size,))
    recurrence = lstm.build_step(
        P, "recurrence",
        input_sizes=[x_extractor_layers[-1], z_extractor_layers[-1]],
        hidden_size=hidden_layer_size)

    infer = vae.build_inferer(
        P, "infer",
        input_sizes=[hidden_layer_size, x_extractor_layers[-1]],
        hidden_sizes=inference_layers,
        output_size=z_size,
        initial_weights=weight_init,
        activation=T.nnet.relu,
        initialise_outputs=False)

    def sample():
        # Unroll the generative model for 40 steps with batch size 1.
        init_hidden = T.tanh(P.init_recurrence_hidden)
        init_cell = P.init_recurrence_cell
        init_hidden_batch = T.alloc(init_hidden, 1, hidden_layer_size)
        init_cell_batch = T.alloc(init_cell, 1, hidden_layer_size)
        noise = U.theano_rng.normal(size=(40, 1, z_size))

        def _step(eps, prev_cell, prev_hidden):
            _, z_prior_mean, z_prior_logvar = prior([prev_hidden])
            z_sample = z_prior_mean + eps * T.exp(0.5 * z_prior_logvar)
            z_feat = Z_extractor([z_sample])
            _, x_mean, _ = generate([prev_hidden, z_feat])
            x_feat = X_extractor([x_mean])
            curr_cell, curr_hidden = recurrence(x_feat, z_feat,
                                                prev_cell, prev_hidden)
            return curr_cell, curr_hidden, x_mean

        [cells, hiddens, x_means], _ = theano.scan(
            _step,
            sequences=[noise],
            outputs_info=[init_cell_batch, init_hidden_batch, None])
        return x_means

    def extract(X, l):
        # X: (time, batch, input_size); l: per-sequence lengths.
        init_hidden = T.tanh(P.init_recurrence_hidden)
        init_cell = P.init_recurrence_cell
        init_hidden_batch = T.alloc(init_hidden, X.shape[1],
                                    hidden_layer_size)
        init_cell_batch = T.alloc(init_cell, X.shape[1], hidden_layer_size)
        noise = U.theano_rng.normal(size=(X.shape[0], X.shape[1], z_size))
        # With p=0.00 the mask never fires; raise p to randomly reset the
        # recurrent state during training.
        reset_init_mask = U.theano_rng.binomial(
            size=(X.shape[0], X.shape[1]), p=0.00)
        X_feat = X_extractor([X])

        def _step(t, x_feat, eps, reset_mask, prev_cell, prev_hidden):
            reset_mask = reset_mask.dimshuffle(0, 'x')
            _, z_prior_mean, z_prior_logvar = prior([prev_hidden])
            _, z_mean, z_logvar = infer([prev_hidden, x_feat])
            # Reparameterisation trick: z = mu + eps * sigma.
            z_sample = z_mean + eps * T.exp(0.5 * z_logvar)
            z_feat = Z_extractor([z_sample])
            _, x_mean, x_logvar = generate([prev_hidden, z_feat])
            curr_cell, curr_hidden = recurrence(x_feat, z_feat,
                                                prev_cell, prev_hidden)
            curr_cell = T.switch(reset_mask, init_cell_batch, curr_cell)
            curr_hidden = T.switch(reset_mask, init_hidden_batch,
                                   curr_hidden)
            # Zero out every output past each sequence's length.
            mask = (t < l).dimshuffle(0, 'x')
            return tuple(T.switch(mask, out, 0)
                         for out in (curr_cell, curr_hidden,
                                     z_prior_mean, z_prior_logvar,
                                     z_sample, z_mean, z_logvar,
                                     x_mean, x_logvar))

        [_, _,
         Z_prior_mean, Z_prior_logvar,
         Z_sample,
         Z_mean, Z_logvar,
         X_mean, X_logvar], _ = theano.scan(
            _step,
            sequences=[T.arange(X_feat.shape[0]), X_feat, noise,
                       reset_init_mask],
            outputs_info=[init_cell_batch, init_hidden_batch] + [None] * 7)

        return [Z_prior_mean, Z_prior_logvar,
                Z_mean, Z_logvar,
                X_mean, X_logvar]

    return extract, sample
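# Hypothetical training-loss sketch (not from the original source): the
# usual VRNN objective is, per time step, the KL divergence between the
# approximate posterior and the prior, minus the Gaussian log-likelihood
# of the data. The tensors are the log-variance-parameterised outputs of
# extract(); steps past each sequence's length were already zeroed there,
# so their KL term vanishes, but padded entries of X still contribute
# constants that a full implementation would mask out as well.
def vrnn_loss(X, Z_prior_mean, Z_prior_logvar, Z_mean, Z_logvar,
              X_mean, X_logvar):
    # KL( N(Z_mean, exp(Z_logvar)) || N(Z_prior_mean, exp(Z_prior_logvar)) )
    kl = 0.5 * T.sum(
        Z_prior_logvar - Z_logvar
        + (T.exp(Z_logvar) + (Z_mean - Z_prior_mean) ** 2)
        / T.exp(Z_prior_logvar)
        - 1,
        axis=-1)
    # Gaussian log-likelihood of X under the decoder.
    log_p = -0.5 * T.sum(
        np.log(2 * np.pi) + X_logvar
        + (X - X_mean) ** 2 / T.exp(X_logvar),
        axis=-1)
    # Negative ELBO, averaged over time and batch.
    return T.mean(kl - log_p)

# extract_fn, sample_fn = build(P, "vrnn")
# loss = vrnn_loss(X, *extract_fn(X, l))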