def head_params(x):
    # `id` is the index of this head, supplied by the enclosing build scope.
    # key
    key_t = T.dot(x, P["W_%d_key" % id]) + P["b_%d_key" % id]

    # shift
    shift_t = U.vector_softmax(
        T.dot(x, P["W_%d_shift" % id]) + P["b_%d_shift" % id])
    shift_t.name = "shift_t"

    # scalars
    _beta_t = T.dot(x, P["W_%d_beta" % id]) + P["b_%d_beta" % id]
    _gamma_t = T.dot(x, P["W_%d_gamma" % id]) + P["b_%d_gamma" % id]

    beta_t = T.nnet.softplus(_beta_t)
    gamma_t = T.nnet.softplus(_gamma_t) + 1.
    # Alternative parameterisations kept for reference:
    # beta_t = (_beta_t > 0) * _beta_t
    # gamma_t = (_gamma_t > 0) * _gamma_t + 1.
    # beta_t = T.exp(_beta_t)
    # gamma_t = T.exp(_gamma_t) + 1.

    g_t = T.nnet.sigmoid(T.dot(x, P["W_%d_g" % id]) + P["b_%d_g" % id])

    erase_t = T.nnet.sigmoid(
        T.dot(x, P["W_%d_erase" % id]) + P["b_%d_erase" % id])
    add_t = T.dot(x, P["W_%d_add" % id]) + P["b_%d_add" % id]

    return key_t, beta_t, g_t, shift_t, gamma_t, erase_t, add_t
def build_head_curr(weight_prev, M_curr, head, input_curr):
    """
    Implement the addressing mechanism shown in Figure 2 of the paper.
    Also return the add and erase vectors computed by the head.
    """
    # input_curr is the hidden layer from the controller; passing it
    # through the head layer yields key, beta, g, shift, gamma, erase,
    # and add as outputs (see head_params in head.py).
    key, beta, g, shift, gamma, erase, add = head(input_curr)

    # 3.3.1 Focusing by Content (Equation (5))
    weight_c = U.vector_softmax(beta * similarity(key, M_curr))
    weight_c.name = "weight_c"

    # 3.3.2 Focusing by Location (Equation (7))
    weight_g = g * weight_c + (1 - g) * weight_prev
    weight_g.name = "weight_g"

    # Equation (8)
    weight_shifted = shift_convolve(weight_g, shift)

    # Equation (9)
    weight_sharp = weight_shifted ** gamma
    weight_curr = weight_sharp / T.sum(weight_sharp)

    return weight_curr, erase, add
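# The helpers U.vector_softmax and similarity (cosine_sim by default) live
# elsewhere in the repo; below is a minimal sketch of what they might look
# like, assuming Theano and a small epsilon to avoid division by zero.
import theano.tensor as T

def vector_softmax(x):
    # Numerically stable softmax over a 1-D vector.
    e = T.exp(x - T.max(x))
    return e / T.sum(e)

def cosine_sim(key, M):
    # Cosine similarity K(key, M(i)) between the key and every memory row,
    # as in Equation (6) of the paper.
    key_norm = T.sqrt(T.sum(key ** 2))
    row_norms = T.sqrt(T.sum(M ** 2, axis=1))
    return T.dot(M, key) / (key_norm * row_norms + 1e-6)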
def head_params(x):
    """
    Take the hidden layer from the controller and compute k_t, beta_t,
    g_t, s_t, gamma_t, and the erase and add vectors as outputs.
    """
    # key
    key_t = T.dot(x, P["W_key"]) + P["b_key"]

    # key strength
    _beta_t = T.dot(x, P["W_beta"]) + P["b_beta"]
    beta_t = T.nnet.softplus(_beta_t)

    # interpolation gate
    g_t = T.nnet.sigmoid(T.dot(x, P["W_g"]) + P["b_g"])

    # shift
    shift_t = U.vector_softmax(T.dot(x, P["W_shift"]) + P["b_shift"])
    shift_t.name = "shift_t"

    # sharpening
    _gamma_t = T.dot(x, P["W_gamma"]) + P["b_gamma"]
    gamma_t = T.nnet.softplus(_gamma_t) + 1.

    # erase and add vectors
    erase_t = T.nnet.sigmoid(T.dot(x, P["W_erase"]) + P["b_erase"])
    add_t = T.dot(x, P["W_add"]) + P["b_add"]

    return key_t, beta_t, g_t, shift_t, gamma_t, erase_t, add_t
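# For intuition, these are the shapes head_params expects, written out in
# numpy terms. H, M, and S are hypothetical sizes; the real parameters are
# created by head.build, which is not shown here.
H, M, S = 100, 20, 3  # controller hidden size, memory width, shift width
P_shapes = {
    "W_key":   (H, M), "b_key":   (M,),  # key_t: M-dim vector
    "W_beta":  (H,),   "b_beta":  (),    # beta_t: scalar key strength
    "W_g":     (H,),   "b_g":     (),    # g_t: scalar gate in (0, 1)
    "W_shift": (H, S), "b_shift": (S,),  # shift_t: distribution over S offsets
    "W_gamma": (H,),   "b_gamma": (),    # gamma_t: scalar >= 1
    "W_erase": (H, M), "b_erase": (M,),  # erase_t: M-dim vector in (0, 1)
    "W_add":   (H, M), "b_add":   (M,),  # add_t: M-dim vector
}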
def qa(story, idxs, qstn):
    word_feats = V[story]
    qn_word_feats = V[qstn]

    diag_cells, diag_hiddens = encode_diag(word_feats, idxs)
    qn_cell, qn_hidden = encode_qstn(qn_word_feats)

    lookup = lookup_prep(diag_hiddens)

    attention = [None] * evidence_count
    evidence = [None] * evidence_count

    prev_cell, prev_hidden = qn_cell, qn_hidden
    prev_attn = 0
    input_vec = T.mean(diag_cells, axis=0)
    for i in xrange(evidence_count):
        prev_cell, prev_hidden = qn2keys(input_vec, prev_cell, prev_hidden)
        attention[i] = lookup(prev_hidden, prev_attn)
        attention[i].name = "attention_%d" % i
        # The attention-weighted sum of the dialogue cells becomes both the
        # evidence for this hop and the input to the next one.
        evidence[i] = input_vec = T.sum(
            attention[i].dimshuffle(0, 'x') * diag_cells, axis=0)
        prev_attn = prev_attn + attention[i]

    final_cell, final_hidden = prev_cell, prev_hidden
    output = U.vector_softmax(
        T.dot(final_hidden, P.W_output_vocab) + P.b_output_vocab)
    return attention, output
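# The dimshuffle(0, 'x') broadcast above is just an attention-weighted sum;
# in plain numpy (toy sizes, hypothetical values):
import numpy as np

attn = np.array([0.1, 0.7, 0.2])   # attention over 3 dialogue positions
cells = np.random.randn(3, 5)      # (positions, cell_size) memory cells
evidence = np.sum(attn[:, None] * cells, axis=0)
assert np.allclose(evidence, np.dot(attn, cells))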
def sampler(temp, x, prev_cell_1, prev_hidden_1, prev_cell_2, prev_hidden_2):
    input_embedding = P.V[x]
    cell_1, hidden_1 = lstm_layer_1(input_embedding, prev_cell_1, prev_hidden_1)
    cell_2, hidden_2 = lstm_layer_2(hidden_1, prev_cell_2, prev_hidden_2)
    # temp acts as an inverse temperature: values above 1 sharpen the
    # output distribution, values below 1 flatten it.
    output = U.vector_softmax(
        temp * (T.dot(hidden_2, P.W_output) + P.b_output))
    return output, cell_1, hidden_1, cell_2, hidden_2
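# A self-contained numpy illustration of the temperature scaling used in
# sampler; note that temp multiplies the logits, so it behaves as an
# inverse temperature.
import numpy as np

def softmax(z):
    e = np.exp(z - np.max(z))
    return e / np.sum(e)

logits = np.array([2.0, 1.0, 0.1])
for temp in (0.5, 1.0, 2.0):
    probs = softmax(temp * logits)
    token = np.random.choice(len(probs), p=probs)
    print(temp, probs.round(3), token)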
def controller(input_t, read_t): # print "input_t",input_t.type new_input_t = (input_t + T.nnet.sigmoid(T.dot(read_t,P.W_read_hidden) + P.b_hidden_read))/2 if input_t.ndim > 1 : output_t = T.nnet.softmax( T.dot(new_input_t, P.W_input_hidden) + P.b_hidden_0 ) else : output_t = U.vector_softmax( T.dot(new_input_t, P.W_input_hidden) + P.b_hidden_0) # print "input",read_t.type,input_t.type # print "weights",P.W_input_hidden.type,P.W_read_hidden.type,P.b_hidden_0.type # print "layer", hidden_0.type return output_t
def controller(input_t, read_t): # print "input_t",input_t.type lstm_weight = 1-P.attention_weight weighted_sum = lstm_weight*input_t + P.attention_weight*read_t if input_t.ndim > 1 : output_t = T.nnet.softmax( T.dot(weighted_sum, P.W_input_hidden) + P.b_hidden_0 ) else : output_t = U.vector_softmax( T.dot(weighted_sum, P.W_input_hidden) + P.b_hidden_0) # print "input",read_t.type,input_t.type # print "weights",P.W_input_hidden.type,P.W_read_hidden.type,P.b_hidden_0.type # print "layer", hidden_0.type return output_t
def build_head_curr(weight_prev, M_curr, head, input_curr):
    """
    This function is best described by Figure 2 in the paper.
    """
    key, beta, g, shift, gamma, erase, add = head(input_curr)

    # 3.3.1 Focusing by Content
    weight_c = U.vector_softmax(beta * similarity(key, M_curr))
    weight_c.name = "weight_c"

    # 3.3.2 Focusing by Location
    weight_g = g * weight_c + (1 - g) * weight_prev
    weight_g.name = "weight_g"

    weight_shifted = shift_convolve(weight_g, shift)

    weight_sharp = weight_shifted ** gamma
    weight_curr = weight_sharp / T.sum(weight_sharp)

    return weight_curr, erase, add
def build_step(P, controller, controller_size, mem_size, mem_width,
               similarity=cosine_sim, shift_width=3, no_heads=1):
    shift_conv = scipy.linalg.circulant(np.arange(mem_size)).T[
        np.arange(-(shift_width // 2), (shift_width // 2) + 1)][::-1]

    P.memory_init = 2 * (np.random.rand(mem_size, mem_width) - 0.5)
    P.weight_init = np.random.randn(mem_size)
    memory_init = P.memory_init
    weight_init = U.vector_softmax(P.weight_init)

    heads = [head.build(P, h, controller_size, mem_width, mem_size, shift_width)
             for h in range(no_heads)]

    def build_memory_curr(M_prev, erase_head, add_head, weight):
        weight = weight.dimshuffle((0, 'x'))
        erase_head = erase_head.dimshuffle(('x', 0))
        add_head = add_head.dimshuffle(('x', 0))
        M_erased = M_prev * (1 - (weight * erase_head))
        M_curr = M_erased + (weight * add_head)
        return M_curr

    def build_read(M_curr, weight_curr):
        return T.dot(weight_curr, M_curr)

    def shift_convolve(weight, shift):
        shift = shift.dimshuffle((0, 'x'))
        return T.sum(shift * weight[shift_conv], axis=0)

    def build_head_curr(weight_prev, M_curr, head, input_curr):
        """
        This function is best described by Figure 2 in the paper.
        """
        key, beta, g, shift, gamma, erase, add = head(input_curr)

        # 3.3.1 Focusing by Content
        weight_c = U.vector_softmax(beta * similarity(key, M_curr))
        weight_c.name = "weight_c"

        # 3.3.2 Focusing by Location
        weight_g = g * weight_c + (1 - g) * weight_prev
        weight_g.name = "weight_g"

        weight_shifted = shift_convolve(weight_g, shift)

        # Sharpening; the epsilon guards against division by zero.
        weight_sharp = weight_shifted ** gamma
        weight_curr = weight_sharp / (T.sum(weight_sharp) + 1e-5)

        return weight_curr, erase, add

    def step(input_curr, M_prev, weight_prev):
        read_prev = build_read(M_prev, weight_prev)
        output, controller_hidden = controller(input_curr, read_prev)
        weight_inter, M_inter = weight_prev, M_prev
        for head in heads:
            weight_inter, erase, add = build_head_curr(
                weight_inter, M_inter, head, controller_hidden)
            M_inter = build_memory_curr(M_inter, erase, add, weight_inter)
        weight_curr, M_curr = weight_inter, M_inter
        return M_curr, weight_curr, output

    return step, [memory_init, weight_init, None]
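# What shift_conv gathers is easiest to see numerically; this
# self-contained snippet shows a weighting being rotated by one slot,
# wrapping around circularly (Equation (8)):
import numpy as np
import scipy.linalg

mem_size, shift_width = 8, 3
shift_conv = scipy.linalg.circulant(np.arange(mem_size)).T[
    np.arange(-(shift_width // 2), (shift_width // 2) + 1)][::-1]

weight = np.zeros(mem_size)
weight[3] = 1.0                    # attention focused on slot 3
shift = np.array([0.0, 0.0, 1.0])  # all shift mass on one offset
shifted = np.sum(shift[:, None] * weight[shift_conv], axis=0)
print(shifted)                     # focus has moved to slot 2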
def build_step(P, controller, controller_size, mem_size, mem_width,
               similarity=cosine_sim, shift_width=3, no_heads=1):
    shift_conv = scipy.linalg.circulant(np.arange(mem_size)).T[
        np.arange(-(shift_width // 2), (shift_width // 2) + 1)][::-1]

    P.memory_init = 2 * (np.random.rand(mem_size, mem_width) - 0.5)
    P.weight_init = np.random.randn(mem_size)
    memory_init = P.memory_init
    weight_init = U.vector_softmax(P.weight_init)

    heads = [head.build(P, h, controller_size, mem_width, mem_size, shift_width)
             for h in range(no_heads)]

    def build_memory_curr(M_prev, erase_head, add_head, weight):
        weight = weight.dimshuffle((0, 'x'))
        erase_head = erase_head.dimshuffle(('x', 0))
        add_head = add_head.dimshuffle(('x', 0))
        M_erased = M_prev * (1 - (weight * erase_head))
        M_curr = M_erased + (weight * add_head)
        return M_curr

    def build_read(M_curr, weight_curr):
        return T.dot(weight_curr, M_curr)

    def shift_convolve(weight, shift):
        shift = shift.dimshuffle((0, 'x'))
        return T.sum(shift * weight[shift_conv], axis=0)

    def build_head_curr(weight_prev, M_curr, head, input_curr):
        """
        This function is best described by Figure 2 in the paper.
        """
        key, beta, g, shift, gamma, erase, add = head(input_curr)

        # 3.3.1 Focusing by Content
        weight_c = U.vector_softmax(beta * similarity(key, M_curr))
        weight_c.name = "weight_c"

        # 3.3.2 Focusing by Location
        weight_g = g * weight_c + (1 - g) * weight_prev
        weight_g.name = "weight_g"

        weight_shifted = shift_convolve(weight_g, shift)

        weight_sharp = weight_shifted ** gamma
        weight_curr = weight_sharp / T.sum(weight_sharp)

        return weight_curr, erase, add

    def step(input_curr, M_prev, weight_prev):
        read_prev = build_read(M_prev, weight_prev)
        output, controller_hidden = controller(input_curr, read_prev)
        weight_inter, M_inter = weight_prev, M_prev
        for head in heads:
            weight_inter, erase, add = build_head_curr(
                weight_inter, M_inter, head, controller_hidden)
            M_inter = build_memory_curr(M_inter, erase, add, weight_inter)
        weight_curr, M_curr = weight_inter, M_inter
        return M_curr, weight_curr, output

    return step, [memory_init, weight_init, None]
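# Equations (3) and (4), the erase-then-add write in build_memory_curr,
# in plain numpy on a toy 4 x 3 memory (hypothetical values):
import numpy as np

rng = np.random.RandomState(0)
M_prev = rng.rand(4, 3)
w = np.array([0.0, 1.0, 0.0, 0.0])  # write weighting focused on row 1
erase = np.ones(3)                  # wipe the focused row completely
add = np.array([0.5, -0.5, 0.25])   # then write a new vector into it

M_erased = M_prev * (1 - w[:, None] * erase[None, :])  # Equation (3)
M_curr = M_erased + w[:, None] * add[None, :]          # Equation (4)
assert np.allclose(M_curr[1], add)        # row 1 fully rewritten
assert np.allclose(M_curr[0], M_prev[0])  # other rows untouched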
def build_step(P, controller, controller_size, mem_size, mem_width,
               similarity=cosine_sim, shift_width=3):
    # Set of shift indices (for shift_width=3, shift offsets of -1, 0, and +1)
    shift_conv = scipy.linalg.circulant(np.arange(mem_size)).T[
        np.arange(-(shift_width // 2), (shift_width // 2) + 1)][::-1]

    # Initial N x M memory: M_0
    P.memory_init = 2 * (np.random.rand(mem_size, mem_width) - 0.5)
    memory_init = P.memory_init

    # Initial N-dim weight vector: w_0
    P.weight_init = np.random.randn(mem_size)
    weight_init = U.vector_softmax(P.weight_init)

    # heads is a function that takes the hidden layer of the controller
    # and computes the key, key strength, interpolation gate, shift,
    # sharpening factor, and erase and add vectors as outputs
    heads = head.build(P, controller_size, mem_width, mem_size, shift_width)

    def build_memory_curr(M_prev, erase_head, add_head, weight):
        """
        Update memory with a write consisting of an erase and an add
        (described in Section 3.2 of the paper)
        """
        weight = weight.dimshuffle((0, 'x'))
        erase_head = erase_head.dimshuffle(('x', 0))
        add_head = add_head.dimshuffle(('x', 0))

        # Equation (3)
        M_erased = M_prev * (1 - (weight * erase_head))
        # Equation (4)
        M_curr = M_erased + (weight * add_head)

        return M_curr

    def build_read(M_curr, weight_curr):
        """
        Obtain the read vector r_t (Equation (2) in the paper)
        """
        return T.dot(weight_curr, M_curr)

    def shift_convolve(weight, shift):
        """
        Circular convolution (Equation (8) in the paper)
        """
        shift = shift.dimshuffle((0, 'x'))
        return T.sum(shift * weight[shift_conv], axis=0)

    def build_head_curr(weight_prev, M_curr, head, input_curr):
        """
        Implement the addressing mechanism shown in Figure 2 of the paper.
        Also return the add and erase vectors computed by the head.
        """
        # input_curr is the hidden layer from the controller; passing it
        # through the head layer yields key, beta, g, shift, gamma, erase,
        # and add as outputs (see head_params in head.py)
        key, beta, g, shift, gamma, erase, add = head(input_curr)

        # 3.3.1 Focusing by Content (Equation (5))
        weight_c = U.vector_softmax(beta * similarity(key, M_curr))
        weight_c.name = "weight_c"

        # 3.3.2 Focusing by Location (Equation (7))
        weight_g = g * weight_c + (1 - g) * weight_prev
        weight_g.name = "weight_g"

        # Equation (8)
        weight_shifted = shift_convolve(weight_g, shift)

        # Equation (9)
        weight_sharp = weight_shifted ** gamma
        weight_curr = weight_sharp / T.sum(weight_sharp)

        return weight_curr, erase, add

    def step(input_curr, M_prev, weight_prev):
        """
        Update the weights and memory from the previous time step
        given the current input
        """
        # Get read vector r_t
        read_prev = build_read(M_prev, weight_prev)

        # Feed the current input and the read vector to the controller to
        # get the controller output and the controller's hidden layer
        output, controller_hidden = controller(input_curr, read_prev)

        # Obtain the new weight vector (as described in Figure 2) and the
        # erase and add vectors
        weight_curr, erase, add = build_head_curr(
            weight_prev, M_prev, heads, controller_hidden)

        # Update memory with the current weight, erase, and add vectors
        # (Section 3.2 in the paper)
        M_curr = build_memory_curr(M_prev, erase, add, weight_curr)

        return M_curr, weight_curr, output

    return step, [memory_init, weight_init, None]
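# A minimal sketch of driving `step` over an input sequence with
# theano.scan; P, controller, and the size arguments are assumed to come
# from the surrounding model code.
import theano
import theano.tensor as T

input_seq = T.matrix('input_seq')  # (time, input_size)
step, outputs_info = build_step(P, controller, controller_size,
                                mem_size, mem_width)
[M_seq, weight_seq, output_seq], _ = theano.scan(
    step,
    sequences=[input_seq],
    outputs_info=outputs_info,  # [memory_init, weight_init, None]
)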