Example #1
    def head_params(x):
        # key
        key_t = T.dot(x, P["W_%d_key" % id]) + P["b_%d_key" % id]

        # shift
        shift_t = U.vector_softmax(
            T.dot(x, P["W_%d_shift" % id]) + P["b_%d_shift" % id])
        shift_t.name = "shift_t"

        # scalars
        _beta_t = T.dot(x, P["W_%d_beta" % id]) + P["b_%d_beta" % id]
        _gamma_t = T.dot(x, P["W_%d_gamma" % id]) + P["b_%d_gamma" % id]

        beta_t = T.nnet.softplus(_beta_t)
        gamma_t = T.nnet.softplus(_gamma_t) + 1.
        # beta_t  = (_beta_t  > 0)*_beta_t
        # gamma_t = (_gamma_t > 0)*_gamma_t + 1.
        # beta_t  = T.exp(_beta_t)
        # gamma_t = T.exp(_gamma_t) + 1.

        g_t = T.nnet.sigmoid(T.dot(x, P["W_%d_g" % id]) + P["b_%d_g" % id])

        erase_t = T.nnet.sigmoid(
            T.dot(x, P["W_%d_erase" % id]) + P["b_%d_erase" % id])
        add_t = T.dot(x, P["W_%d_add" % id]) + P["b_%d_add" % id]

        return key_t, beta_t, g_t, shift_t, gamma_t, erase_t, add_t
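The commented-out lines above look like earlier experiments with other ways of constraining the two scalars (ReLU-style clipping and exp); the softplus versions that were kept guarantee the ranges the NTM addressing scheme needs, namely beta_t >= 0 and gamma_t >= 1. A minimal NumPy sketch (not part of this project) checking those ranges:

import numpy as np

def softplus(z):
    # numerically stable log(1 + exp(z))
    return np.logaddexp(0.0, z)

z = np.array([-5.0, 0.0, 5.0])
beta_t = softplus(z)           # key strength, always >= 0
gamma_t = softplus(z) + 1.0    # sharpening factor, always >= 1
print(beta_t)                  # approx [0.0067 0.6931 5.0067]
print(gamma_t)                 # approx [1.0067 1.6931 6.0067]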
Example #2
    def head_params(x):
        # key
        key_t = T.dot(x, P["W_%d_key" % id]) + P["b_%d_key" % id]

        # shift
        shift_t = U.vector_softmax(
            T.dot(x, P["W_%d_shift" % id]) + P["b_%d_shift" % id])
        shift_t.name = "shift_t"

        # scalars
        _beta_t = T.dot(x, P["W_%d_beta" % id]) + P["b_%d_beta" % id]
        _gamma_t = T.dot(x, P["W_%d_gamma" % id]) + P["b_%d_gamma" % id]

        beta_t = T.nnet.softplus(_beta_t)
        gamma_t = T.nnet.softplus(_gamma_t) + 1.
        # beta_t  = (_beta_t  > 0)*_beta_t
        # gamma_t = (_gamma_t > 0)*_gamma_t + 1.
        # beta_t  = T.exp(_beta_t)
        # gamma_t = T.exp(_gamma_t) + 1.

        g_t = T.nnet.sigmoid(T.dot(x, P["W_%d_g" % id]) + P["b_%d_g" % id])

        erase_t = T.nnet.sigmoid(
            T.dot(x, P["W_%d_erase" % id]) + P["b_%d_erase" % id])
        add_t = T.dot(x, P["W_%d_add" % id]) + P["b_%d_add" % id]

        return key_t, beta_t, g_t, shift_t, gamma_t, erase_t, add_t
Example #3
    def build_head_curr(weight_prev, M_curr, head, input_curr):
        """
                Implement addressing mechanism shown in Figure 2 in paper.
                Also return add and erase vectors computed by head.
                """
        # input_curr is hidden layer from controller
        # this is passing the hidden layer into the heads layer
        # which computes key, beta, g, shift, gamma, erase, and add
        # as outputs (see head_params in head.py)
        key, beta, g, shift, gamma, erase, add = head(input_curr)

        # 3.3.1 Focusing by Content (Equation (5))
        weight_c = U.vector_softmax(beta * similarity(key, M_curr))
        weight_c.name = "weight_c"

        # 3.3.2 Focusing by Location (Equation (7))
        weight_g = g * weight_c + (1 - g) * weight_prev
        weight_g.name = "weight_g"

        # Equation (8)
        weight_shifted = shift_convolve(weight_g, shift)

        # Equation (9)
        weight_sharp = weight_shifted**gamma
        weight_curr = weight_sharp / T.sum(weight_sharp)

        return weight_curr, erase, add
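For reference, the addressing steps that the comments above cite are Equations (5), (7), (8) and (9) of the Neural Turing Machines paper (Graves et al., 2014); in the paper's notation:

$$w^c_t(i) = \frac{\exp\big(\beta_t\,K[\mathbf{k}_t, \mathbf{M}_t(i)]\big)}{\sum_j \exp\big(\beta_t\,K[\mathbf{k}_t, \mathbf{M}_t(j)]\big)} \tag{5}$$

$$\mathbf{w}^g_t = g_t\,\mathbf{w}^c_t + (1 - g_t)\,\mathbf{w}_{t-1} \tag{7}$$

$$\tilde{w}_t(i) = \sum_{j=0}^{N-1} w^g_t(j)\, s_t(i - j) \tag{8}$$

$$w_t(i) = \frac{\tilde{w}_t(i)^{\gamma_t}}{\sum_j \tilde{w}_t(j)^{\gamma_t}} \tag{9}$$

weight_c, weight_g, weight_shifted and weight_curr in the code correspond to $w^c_t$, $w^g_t$, $\tilde{w}_t$ and $w_t$ respectively.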
Example #4
    def head_params(x):
        """
        Takes hidden layer from controller and computes
        k_t, beta_t, g_t, s_t, gamma_t, and the erase and add
        vectors as outputs.
        """
        # key
        key_t = T.dot(x, P["W_key"]) + P["b_key"]

        # key strength
        _beta_t = T.dot(x, P["W_beta"]) + P["b_beta"]
        beta_t = T.nnet.softplus(_beta_t)

        # interpolation gate
        g_t = T.nnet.sigmoid(T.dot(x, P["W_g"]) + P["b_g"])

        # shift
        shift_t = U.vector_softmax(T.dot(x, P["W_shift"]) + P["b_shift"])
        shift_t.name = "shift_t"

        # sharpening
        _gamma_t = T.dot(x, P["W_gamma"]) + P["b_gamma"]
        gamma_t = T.nnet.softplus(_gamma_t) + 1.

        # erase and add vectors
        erase_t = T.nnet.sigmoid(T.dot(x, P["W_erase"]) + P["b_erase"])
        add_t = T.dot(x, P["W_add"]) + P["b_add"]

        return key_t, beta_t, g_t, shift_t, gamma_t, erase_t, add_t
Example #5
	def qa(story,idxs,qstn):
		word_feats    = V[story]
		qn_word_feats = V[qstn]

		diag_cells,diag_hiddens = encode_diag(word_feats,idxs)
		qn_cell,qn_hidden = encode_qstn(qn_word_feats)
		
		lookup = lookup_prep(diag_hiddens)

		attention = [None] * evidence_count
		evidence  = [None] * evidence_count


		prev_cell,prev_hidden = qn_cell,qn_hidden
		prev_attn = 0
		alpha = 0.0
		input_vec = T.mean(diag_cells,axis=0)
		for i in xrange(evidence_count): 
			prev_cell, prev_hidden = qn2keys(input_vec,prev_cell,prev_hidden)
			attention[i] = lookup(prev_hidden,prev_attn)
			attention[i].name = "attention_%d"%i
			evidence[i] = input_vec = T.sum(attention[i].dimshuffle(0,'x') * diag_cells,axis=0)
								#	alpha * T.mean(diag_vectors,axis=0)
			prev_attn = prev_attn + attention[i]
		final_cell, final_hidden = prev_cell,prev_hidden

		output = U.vector_softmax(T.dot(final_hidden,P.W_output_vocab) + P.b_output_vocab)
		return attention,output
Example #6
    def build_head_curr(weight_prev, M_curr, head, input_curr):
        """
        Implement addressing mechanism shown in Figure 2 in paper.
        Also return add and erase vectors computed by head.
        """
        # input_curr is hidden layer from controller
        # this is passing the hidden layer into the heads layer
        # which computes key, beta, g, shift, gamma, erase, and add
        # as outputs (see head_params in head.py)
        key, beta, g, shift, gamma, erase, add = head(input_curr)

        # 3.3.1 Focusing by Content (Equation (5))
        weight_c = U.vector_softmax(beta * similarity(key, M_curr))
        weight_c.name = "weight_c"

        # 3.3.2 Focusing by Location (Equation (7))
        weight_g = g * weight_c + (1 - g) * weight_prev
        weight_g.name = "weight_g"

        # Equation (8)
        weight_shifted = shift_convolve(weight_g, shift)

        # Equation (9)
        weight_sharp = weight_shifted ** gamma
        weight_curr = weight_sharp / T.sum(weight_sharp)

        return weight_curr, erase, add
Example #7
    def qa(story, idxs, qstn):
        word_feats = V[story]
        qn_word_feats = V[qstn]

        diag_cells, diag_hiddens = encode_diag(word_feats, idxs)
        qn_cell, qn_hidden = encode_qstn(qn_word_feats)

        lookup = lookup_prep(diag_hiddens)

        attention = [None] * evidence_count
        evidence = [None] * evidence_count

        prev_cell, prev_hidden = qn_cell, qn_hidden
        prev_attn = 0
        alpha = 0.0
        input_vec = T.mean(diag_cells, axis=0)
        for i in xrange(evidence_count):
            prev_cell, prev_hidden = qn2keys(input_vec, prev_cell, prev_hidden)
            attention[i] = lookup(prev_hidden, prev_attn)
            attention[i].name = "attention_%d" % i
            evidence[i] = input_vec = T.sum(attention[i].dimshuffle(0, 'x') *
                                            diag_cells,
                                            axis=0)
            #	alpha * T.mean(diag_vectors,axis=0)
            prev_attn = prev_attn + attention[i]
        final_cell, final_hidden = prev_cell, prev_hidden

        output = U.vector_softmax(
            T.dot(final_hidden, P.W_output_vocab) + P.b_output_vocab)
        return attention, output
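The evidence step above multiplies every dialogue cell state by its attention weight and sums over the story axis; dimshuffle(0, 'x') just adds a broadcastable column so the attention vector lines up against the (length, features) matrix of cell states. A rough NumPy analogue of that one step (the sizes and random inputs are assumptions, not values from this project):

import numpy as np

story_len, cell_dim = 6, 4                              # assumed sizes
attention_i = np.random.dirichlet(np.ones(story_len))   # stand-in for lookup(prev_hidden, prev_attn)
diag_cells = np.random.randn(story_len, cell_dim)

# attention_i[:, None] plays the role of dimshuffle(0, 'x'):
# (story_len, 1) broadcasts against (story_len, cell_dim), then sum over axis 0.
evidence_i = (attention_i[:, None] * diag_cells).sum(axis=0)
print(evidence_i.shape)                                  # (4,)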
Example #8
    def head_params(x):
        """
                Takes hidden layer from controller computes
                k_t, beta_t, g_t, s_t,  and erase and add
                vectors as outputs
                """
        # key
        key_t = T.dot(x, P["W_key"]) + P["b_key"]

        # key strength
        _beta_t = T.dot(x, P["W_beta"]) + P["b_beta"]
        beta_t = T.nnet.softplus(_beta_t)

        # interpolation gate
        g_t = T.nnet.sigmoid(T.dot(x, P["W_g"]) + P["b_g"])

        # shift
        shift_t = U.vector_softmax(T.dot(x, P["W_shift"]) + P["b_shift"])
        shift_t.name = "shift_t"

        # sharpening
        _gamma_t = T.dot(x, P["W_gamma"]) + P["b_gamma"]
        gamma_t = T.nnet.softplus(_gamma_t) + 1.

        # erase and add vectors
        erase_t = T.nnet.sigmoid(T.dot(x, P["W_erase"]) + P["b_erase"])
        add_t = T.dot(x, P["W_add"]) + P["b_add"]

        return key_t, beta_t, g_t, shift_t, gamma_t, erase_t, add_t
Example #9
 def sampler(temp, x, prev_cell_1, prev_hidden_1, prev_cell_2,
             prev_hidden_2):
     input_embedding = P.V[x]
     cell_1, hidden_1 = lstm_layer_1(input_embedding, prev_cell_1,
                                     prev_hidden_1)
     cell_2, hidden_2 = lstm_layer_2(hidden_1, prev_cell_2, prev_hidden_2)
     output = U.vector_softmax(temp *
                               (T.dot(hidden_2, P.W_output) + P.b_output))
     return output, cell_1, hidden_1, cell_2, hidden_2
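Note that temp multiplies the logits before the softmax, so it acts as an inverse temperature: values above 1 sharpen the sampling distribution, values below 1 flatten it towards uniform. A small NumPy sketch of that effect (the softmax below is an assumed stand-in for U.vector_softmax):

import numpy as np

def vector_softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.0])
print(vector_softmax(0.5 * logits))   # flatter
print(vector_softmax(1.0 * logits))
print(vector_softmax(2.0 * logits))   # more peaked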
Example #10
    def controller(input_t, read_t):
        # print "input_t",input_t.type
        new_input_t = (input_t + T.nnet.sigmoid(
            T.dot(read_t, P.W_read_hidden) + P.b_hidden_read)) / 2

        if input_t.ndim > 1:
            output_t = T.nnet.softmax(
                T.dot(new_input_t, P.W_input_hidden) +
                P.b_hidden_0
            )
        else:
            output_t = U.vector_softmax(
                T.dot(new_input_t, P.W_input_hidden) +
                P.b_hidden_0)

        # print "input",read_t.type,input_t.type
        # print "weights",P.W_input_hidden.type,P.W_read_hidden.type,P.b_hidden_0.type
        # print "layer", hidden_0.type

        return output_t
Example #11
    def controller(input_t, read_t):
        # print "input_t",input_t.type
        lstm_weight = 1 - P.attention_weight
        weighted_sum = lstm_weight * input_t + P.attention_weight * read_t

        if input_t.ndim > 1:
            output_t = T.nnet.softmax(
                T.dot(weighted_sum, P.W_input_hidden) +
                P.b_hidden_0
            )
        else:
            output_t = U.vector_softmax(
                T.dot(weighted_sum, P.W_input_hidden) +
                P.b_hidden_0)

        # print "input",read_t.type,input_t.type
        # print "weights",P.W_input_hidden.type,P.W_read_hidden.type,P.b_hidden_0.type
        # print "layer", hidden_0.type

        return output_t
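Compared with the previous controller, which squashes the read vector through a learned sigmoid projection and then averages it with the input, this variant mixes the two with a single learned scalar P.attention_weight, i.e. an affine combination that is convex whenever that scalar stays in [0, 1]. A toy NumPy illustration with made-up values:

import numpy as np

attention_weight = 0.3                 # hypothetical value of the learned scalar
input_t = np.array([1.0, 0.0, 0.0])
read_t = np.array([0.0, 1.0, 0.0])

weighted_sum = (1 - attention_weight) * input_t + attention_weight * read_t
print(weighted_sum)                    # [0.7 0.3 0. ]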
Example #12
    def build_head_curr(weight_prev, M_curr, head, input_curr):
        """
        This function is best described by Figure 2 in the paper.
        """
        key, beta, g, shift, gamma, erase, add = head(input_curr)

        # 3.3.1 Focusing by Content
        weight_c = U.vector_softmax(beta * similarity(key, M_curr))
        weight_c.name = "weight_c"

        # 3.3.2 Focusing by Location
        weight_g = g * weight_c + (1 - g) * weight_prev
        weight_g.name = "weight_g"

        weight_shifted = shift_convolve(weight_g, shift)

        weight_sharp = weight_shifted ** gamma
        weight_curr = weight_sharp / T.sum(weight_sharp)

        return weight_curr, erase, add
Example #13
	def build_head_curr(weight_prev,M_curr,head,input_curr):
		"""
		This function is best described by Figure 2 in the paper.
		"""
		key,beta,g,shift,gamma,erase,add = head(input_curr)

		# 3.3.1 Focusing by Content
		weight_c = U.vector_softmax(beta * similarity(key,M_curr))
		weight_c.name = "weight_c"

		# 3.3.2 Focusing by Location
		weight_g       = g * weight_c + (1 - g) * weight_prev
		weight_g.name = "weight_g"

		weight_shifted = shift_convolve(weight_g,shift)

		weight_sharp   = weight_shifted ** gamma
		weight_curr    = weight_sharp / T.sum(weight_sharp)

		return weight_curr,erase,add
Example #14
File: model.py Project: c3h3/pyntm
def build_step(P,controller,controller_size,mem_size,mem_width,similarity=cosine_sim,shift_width=3,no_heads=1):

    shift_conv = scipy.linalg.circulant(np.arange(mem_size)).T[np.arange(-(shift_width//2),(shift_width//2)+1)][::-1]
#     shift_conv = scipy.linalg.circulant(np.arange(mem_size)).T[np.arange(-(shift_width//2),(shift_width//2)+1)][::-1][1:]

    P.memory_init = 2 * (np.random.rand(mem_size,mem_width) - 0.5) # U.initial_weights(mem_size,mem_width) # 
    P.weight_init = np.random.randn(mem_size) #U.initial_weights(mem_size)# 

    memory_init = P.memory_init
    weight_init = U.vector_softmax(P.weight_init)

    heads = [head.build(P,h,controller_size,mem_width,mem_size,shift_width) for h in range(no_heads)]

    def build_memory_curr(M_prev,erase_head,add_head,weight):
        weight = weight.dimshuffle((0,'x'))

        erase_head = erase_head.dimshuffle(('x',0))
        add_head   = add_head.dimshuffle(('x',0))

        M_erased = M_prev   * (1 - (weight * erase_head))
        M_curr   = M_erased +      (weight * add_head)
        return M_curr

    def build_read(M_curr,weight_curr):
        return T.dot(weight_curr, M_curr)

    def shift_convolve(weight,shift):
        shift = shift.dimshuffle((0,'x')) # 3X100
        return T.sum(shift * weight[shift_conv],axis=0)
    
        #return weight[shift*shift_conv[0] + (1-shift)*shift_conv[1]]
#         sh = shift #1 - (shift - T.floor(shift))
#         return ( sh * weight[shift_conv[0]] + (1 - sh) * weight[shift_conv[1]] )


    def build_head_curr(weight_prev,M_curr,head,input_curr):
        """
        This function is best described by Figure 2 in the paper.
        """
        key,beta,g,shift,gamma,erase,add = head(input_curr)

        # 3.3.1 Focusing by Content
        weight_c = U.vector_softmax(beta * similarity(key,M_curr))
        weight_c.name = "weight_c"

        # 3.3.2 Focusing by Location
        weight_g       = g * weight_c + (1 - g) * weight_prev
        weight_g.name = "weight_g" # 128
        
        weight_shifted = shift_convolve(weight_g,shift)

        # gamma!!!!!! 
        weight_sharp   = weight_shifted ** gamma
        weight_curr    = weight_sharp / (T.sum(weight_sharp)+ 1e-5)
#         weight_curr    = weight_sharp / (T.sum(weight_sharp))

        return weight_curr,erase,add

    def step(input_curr,M_prev,weight_prev):
        #print read_prev.type

        read_prev = build_read(M_prev,weight_prev)
        output,controller_hidden = controller(input_curr,read_prev)

        weight_inter,M_inter = weight_prev,M_prev
        for head in heads:
            weight_inter,erase,add = build_head_curr(weight_inter,M_inter,head,controller_hidden)
            M_inter = build_memory_curr(M_inter,erase,add,weight_inter)
        weight_curr,M_curr = weight_inter,M_inter

        #print [i.type for i in [erase_curr,add_curr,key_curr,shift_curr,beta_curr,gamma_curr,g_curr,output]]
        #print weight_curr.type
        return M_curr,weight_curr,output
    return step,[memory_init,weight_init,None]
Example #15
def build_step(P,controller,controller_size,mem_size,mem_width,similarity=cosine_sim,shift_width=3,no_heads=1):
	
	shift_conv = scipy.linalg.circulant(np.arange(mem_size)).T[np.arange(-(shift_width//2),(shift_width//2)+1)][::-1]

	P.memory_init = 2 * (np.random.rand(mem_size,mem_width) - 0.5)
	P.weight_init = np.random.randn(mem_size)

	memory_init = P.memory_init
	weight_init = U.vector_softmax(P.weight_init)

	heads = [head.build(P,h,controller_size,mem_width,mem_size,shift_width) for h in range(no_heads)]
	
	def build_memory_curr(M_prev,erase_head,add_head,weight):
		weight = weight.dimshuffle((0,'x'))

		erase_head = erase_head.dimshuffle(('x',0))
		add_head   = add_head.dimshuffle(('x',0))

		M_erased = M_prev   * (1 - (weight * erase_head))
		M_curr   = M_erased +      (weight * add_head)
		return M_curr
	
	def build_read(M_curr,weight_curr):
		return T.dot(weight_curr, M_curr)

	def shift_convolve(weight,shift):
		shift = shift.dimshuffle((0,'x'))
		return T.sum(shift * weight[shift_conv],axis=0)


	def build_head_curr(weight_prev,M_curr,head,input_curr):
		"""
		This function is best described by Figure 2 in the paper.
		"""
		key,beta,g,shift,gamma,erase,add = head(input_curr)

		# 3.3.1 Focusing by Content
		weight_c = U.vector_softmax(beta * similarity(key,M_curr))
		weight_c.name = "weight_c"

		# 3.3.2 Focusing by Location
		weight_g       = g * weight_c + (1 - g) * weight_prev
		weight_g.name = "weight_g"

		weight_shifted = shift_convolve(weight_g,shift)

		weight_sharp   = weight_shifted ** gamma
		weight_curr    = weight_sharp / T.sum(weight_sharp)

		return weight_curr,erase,add
	
	def step(input_curr,M_prev,weight_prev):
		#print read_prev.type
		
		read_prev = build_read(M_prev,weight_prev)
		output,controller_hidden = controller(input_curr,read_prev)

		weight_inter,M_inter = weight_prev,M_prev
		for head in heads:
			weight_inter,erase,add = build_head_curr(weight_inter,M_inter,head,controller_hidden)
			M_inter = build_memory_curr(M_inter,erase,add,weight_inter)
		weight_curr,M_curr = weight_inter,M_inter
		
		#print [i.type for i in [erase_curr,add_curr,key_curr,shift_curr,beta_curr,gamma_curr,g_curr,output]]
		#print weight_curr.type
		return M_curr,weight_curr,output
	return step,[memory_init,weight_init,None]
Example #16
def build_step(P,
               controller,
               controller_size,
               mem_size,
               mem_width,
               similarity=cosine_sim,
               shift_width=3):
    # Set of shift indices (for shift_width=3, have shift offsets of -1, 0, and +1)
    shift_conv = scipy.linalg.circulant(np.arange(mem_size)).T[np.arange(
        -(shift_width // 2), (shift_width // 2) + 1)][::-1]

    # Initial N X M memory: M_0
    P.memory_init = 2 * (np.random.rand(mem_size, mem_width) - 0.5)
    memory_init = P.memory_init

    # Initial N-dim weight vector: w_0
    P.weight_init = np.random.randn(mem_size)
    weight_init = U.vector_softmax(P.weight_init)

    # heads is a function taking the hidden layer of the controller and
    # computes the key, key strength, interpolation gate,
    # sharpening factor, and erase and add vectors as outputs
    heads = head.build(P, controller_size, mem_width, mem_size, shift_width)

    def build_memory_curr(M_prev, erase_head, add_head, weight):
        """
                Update memory with write consisting of erase and add
                (described in section 3.2 in paper)
                """
        weight = weight.dimshuffle((0, 'x'))

        erase_head = erase_head.dimshuffle(('x', 0))
        add_head = add_head.dimshuffle(('x', 0))

        # Equation (3)
        M_erased = M_prev * (1 - (weight * erase_head))
        # Equation (4)
        M_curr = M_erased + (weight * add_head)

        return M_curr

    def build_read(M_curr, weight_curr):
        """
                Obtain read vector r_t (Equation (2) in paper)
                """
        return T.dot(weight_curr, M_curr)

    def shift_convolve(weight, shift):
        """
                Circular convolution (Equation (8) in paper)
                """
        shift = shift.dimshuffle((0, 'x'))
        return T.sum(shift * weight[shift_conv], axis=0)

    def build_head_curr(weight_prev, M_curr, head, input_curr):
        """
                Implement addressing mechanism shown in Figure 2 in paper.
                Also return add and erase vectors computed by head.
                """
        # input_curr is hidden layer from controller
        # this is passing the hidden layer into the heads layer
        # which computes key, beta, g, shift, gamma, erase, and add
        # as outputs (see head_params in head.py)
        key, beta, g, shift, gamma, erase, add = head(input_curr)

        # 3.3.1 Focusing by Content (Equation (5))
        weight_c = U.vector_softmax(beta * similarity(key, M_curr))
        weight_c.name = "weight_c"

        # 3.3.2 Focusing by Location (Equation (7))
        weight_g = g * weight_c + (1 - g) * weight_prev
        weight_g.name = "weight_g"

        # Equation (8)
        weight_shifted = shift_convolve(weight_g, shift)

        # Equation (9)
        weight_sharp = weight_shifted**gamma
        weight_curr = weight_sharp / T.sum(weight_sharp)

        return weight_curr, erase, add

    def step(input_curr, M_prev, weight_prev):
        """
                Update the weights and memory from the previous time step
                given the current input
                """
        # Get read vector r_t
        read_prev = build_read(M_prev, weight_prev)

        # Feed current input and read input to controller to get
        # controller output and hidden layer of controller
        output, controller_hidden = controller(input_curr, read_prev)

        # Obtain new weight vector (as described in figure 2) and erase and add vectors
        weight_curr, erase, add = build_head_curr(weight_prev, M_prev, heads,
                                                  controller_hidden)
        # Update memory with current weight, erase, and add vectors (Section 3.2 in paper)
        M_curr = build_memory_curr(M_prev, erase, add, weight_curr)

        return M_curr, weight_curr, output

    return step, [memory_init, weight_init, None]
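The read and write helpers above, build_read and build_memory_curr, implement Equations (2)-(4) of the paper; with read/write weighting $w_t$, erase vector $\mathbf{e}_t$ and add vector $\mathbf{a}_t$:

$$\mathbf{r}_t = \sum_i w_t(i)\,\mathbf{M}_t(i) \tag{2}$$

$$\tilde{\mathbf{M}}_t(i) = \mathbf{M}_{t-1}(i)\,\big[\mathbf{1} - w_t(i)\,\mathbf{e}_t\big] \tag{3}$$

$$\mathbf{M}_t(i) = \tilde{\mathbf{M}}_t(i) + w_t(i)\,\mathbf{a}_t \tag{4}$$

In the code, the per-row elementwise products are expressed by dimshuffling weight into a column and erase_head/add_head into rows before broadcasting.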
Example #17
 def sampler(temp, x, prev_cell_1, prev_hidden_1, prev_cell_2, prev_hidden_2):
     input_embedding = P.V[x]
     cell_1, hidden_1 = lstm_layer_1(input_embedding, prev_cell_1, prev_hidden_1)
     cell_2, hidden_2 = lstm_layer_2(hidden_1, prev_cell_2, prev_hidden_2)
     output = U.vector_softmax(temp * (T.dot(hidden_2, P.W_output) + P.b_output))
     return output, cell_1, hidden_1, cell_2, hidden_2
Example #18
def build_step(P, controller, controller_size, mem_size, mem_width, similarity=cosine_sim, shift_width=3, no_heads=1):
    shift_conv = scipy.linalg.circulant(np.arange(mem_size)).T[np.arange(-(shift_width // 2), (shift_width // 2) + 1)][
        ::-1
    ]

    P.memory_init = 2 * (np.random.rand(mem_size, mem_width) - 0.5)
    P.weight_init = np.random.randn(mem_size)

    memory_init = P.memory_init
    weight_init = U.vector_softmax(P.weight_init)

    heads = [head.build(P, h, controller_size, mem_width, mem_size, shift_width) for h in range(no_heads)]

    def build_memory_curr(M_prev, erase_head, add_head, weight):
        weight = weight.dimshuffle((0, "x"))

        erase_head = erase_head.dimshuffle(("x", 0))
        add_head = add_head.dimshuffle(("x", 0))

        M_erased = M_prev * (1 - (weight * erase_head))
        M_curr = M_erased + (weight * add_head)
        return M_curr

    def build_read(M_curr, weight_curr):
        return T.dot(weight_curr, M_curr)

    def shift_convolve(weight, shift):
        shift = shift.dimshuffle((0, "x"))
        return T.sum(shift * weight[shift_conv], axis=0)

    def build_head_curr(weight_prev, M_curr, head, input_curr):
        """
        This function is best described by Figure 2 in the paper.
        """
        key, beta, g, shift, gamma, erase, add = head(input_curr)

        # 3.3.1 Focusing by Content
        weight_c = U.vector_softmax(beta * similarity(key, M_curr))
        weight_c.name = "weight_c"

        # 3.3.2 Focusing by Location
        weight_g = g * weight_c + (1 - g) * weight_prev
        weight_g.name = "weight_g"

        weight_shifted = shift_convolve(weight_g, shift)

        weight_sharp = weight_shifted ** gamma
        weight_curr = weight_sharp / T.sum(weight_sharp)

        return weight_curr, erase, add

    def step(input_curr, M_prev, weight_prev):
        read_prev = build_read(M_prev, weight_prev)
        output, controller_hidden = controller(input_curr, read_prev)
        weight_inter, M_inter = weight_prev, M_prev
        for head in heads:
            weight_inter, erase, add = build_head_curr(weight_inter, M_inter, head, controller_hidden)
            M_inter = build_memory_curr(M_inter, erase, add, weight_inter)
        weight_curr, M_curr = weight_inter, M_inter
        return M_curr, weight_curr, output

    return step, [memory_init, weight_init, None]
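The shift_conv matrix built at the top of these build_step variants precomputes, for every memory slot, the indices of its circular neighbours, so shift_convolve reduces the circular convolution of Equation (8) to an indexed multiply-and-sum. A standalone NumPy check of that indexing (mem_size and the shift distributions are made-up values):

import numpy as np
import scipy.linalg

mem_size, shift_width = 8, 3
shift_conv = scipy.linalg.circulant(np.arange(mem_size)).T[
    np.arange(-(shift_width // 2), (shift_width // 2) + 1)][::-1]

weight = np.zeros(mem_size)
weight[3] = 1.0                             # attention focused on slot 3

identity_shift = np.array([0.0, 1.0, 0.0])  # all mass on the middle shift component
shifted = (identity_shift[:, None] * weight[shift_conv]).sum(axis=0)
print(np.argmax(shifted))                   # 3: focus stays in place

move_shift = np.array([1.0, 0.0, 0.0])      # all mass on an outer component
shifted = (move_shift[:, None] * weight[shift_conv]).sum(axis=0)
print(np.argmax(shifted))                   # 4: focus rotates to a neighbouring slot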
Example #19
def build_step(P, controller, controller_size, mem_size, mem_width,
               similarity=cosine_sim, shift_width=3):
    # Set of shift indices (for shift_width=3, have shift offsets of -1, 0, and +1)
    shift_conv = scipy.linalg.circulant(np.arange(mem_size)).T[
        np.arange(-(shift_width // 2), (shift_width // 2) + 1)][::-1]

    # Initial N X M memory: M_0
    P.memory_init = 2 * (np.random.rand(mem_size, mem_width) - 0.5)
    memory_init = P.memory_init

    # Initial N-dim weight vector: w_0
    P.weight_init = np.random.randn(mem_size)
    weight_init = U.vector_softmax(P.weight_init)

    # heads is a function taking the hidden layer of the controller and
    # computing the key, key strength, interpolation gate, shift,
    # sharpening factor, and erase and add vectors as outputs
    heads = head.build(P, controller_size, mem_width, mem_size, shift_width)

    def build_memory_curr(M_prev, erase_head, add_head, weight):
        """
        Update memory with write consisting of erase and add
        (described in section 3.2 in paper)
        """
        weight = weight.dimshuffle((0, 'x'))

        erase_head = erase_head.dimshuffle(('x', 0))
        add_head = add_head.dimshuffle(('x', 0))

        # Equation (3)
        M_erased = M_prev * (1 - (weight * erase_head))
        # Equation (4)
        M_curr = M_erased + (weight * add_head)

        return M_curr

    def build_read(M_curr, weight_curr):
        """
        Obtain read vector r_t (Equation (2) in paper)
        """
        return T.dot(weight_curr, M_curr)

    def shift_convolve(weight, shift):
        """
        Circular convolution (Equation (8) in paper)
        """
        shift = shift.dimshuffle((0, 'x'))
        return T.sum(shift * weight[shift_conv], axis=0)

    def build_head_curr(weight_prev, M_curr, head, input_curr):
        """
        Implement addressing mechanism shown in Figure 2 in paper.
        Also return add and erase vectors computed by head.
        """
        # input_curr is hidden layer from controller
        # this is passing the hidden layer into the heads layer
        # which computes key, beta, g, shift, gamma, erase, and add
        # as outputs (see head_params in head.py)
        key, beta, g, shift, gamma, erase, add = head(input_curr)

        # 3.3.1 Focusing by Content (Equation (5))
        weight_c = U.vector_softmax(beta * similarity(key, M_curr))
        weight_c.name = "weight_c"

        # 3.3.2 Focusing by Location (Equation (7))
        weight_g = g * weight_c + (1 - g) * weight_prev
        weight_g.name = "weight_g"

        # Equation (8)
        weight_shifted = shift_convolve(weight_g, shift)

        # Equation (9)
        weight_sharp = weight_shifted ** gamma
        weight_curr = weight_sharp / T.sum(weight_sharp)

        return weight_curr, erase, add

    def step(input_curr, M_prev, weight_prev):
        """
        Update the weights and memory from the previous time step
        given the current input
        """
        # Get read vector r_t
        read_prev = build_read(M_prev, weight_prev)

        # Feed current input and read input to controller to get
        # controller output and hidden layer of controller
        output, controller_hidden = controller(input_curr, read_prev)

        # Obtain new weight vector (as described in figure 2) and erase and add vectors
        weight_curr, erase, add = build_head_curr(weight_prev, M_prev, heads,
                                                  controller_hidden)
        # Update memory with current weight, erase, and add vectors (Section 3.2 in paper)
        M_curr = build_memory_curr(M_prev, erase, add, weight_curr)

        return M_curr, weight_curr, output

    return step, [memory_init, weight_init, None]