def __init__(self, args, pretrained):
        super(Transfrmr_bidaf, self).__init__()
        self.embed = embed.Embedding(args, pretrained)

        # Encoder module
        self.encoder_ctxt = encode.Encoder_block(args, 2 * args.word_dim)
        self.encoder_ques = encode.Encoder_block(args, 2 * args.word_dim)

        # Attention Flow Layer
        self.att_weight_c = Linear(args.hidden_size * 2, 1, args.dropout)
        self.att_weight_q = Linear(args.hidden_size * 2, 1, args.dropout)
        self.att_weight_cq = Linear(args.hidden_size * 2, 1, args.dropout)
        self.N = args.Model_encoder_size
        self.dropout = nn.Dropout(p=args.dropout)

        # Model Encoding Layer
        self.Model_encoder = self.get_clones(
            encode.Encoder_block(args, 8 * args.word_dim),
            args.Model_encoder_size)
        # self.Model2start= Linear(16 * args.word_dim, 8 * args.word_dim,args.dropout)
        # self.Model2end = Linear(16 * args.word_dim, 8 * args.word_dim,args.dropout)
        # self.start_idx = Linear(16 * args.word_dim,1,args.dropout)
        # self.end_idx = Linear(16 * args.word_dim, 1, args.dropout)
        self.start_idx = nn.Linear(16 * args.word_dim, 1)
        self.end_idx = nn.Linear(16 * args.word_dim, 1)
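
Note: the Linear(in_features, out_features, dropout) helper used throughout these examples is project-specific and not shown on this page. A minimal sketch of what such a dropout-wrapping layer typically looks like (an assumption for illustration, not any project's actual code):

import torch.nn as nn

class Linear(nn.Module):
    # Hypothetical dropout-wrapping linear layer matching the call sites above.
    def __init__(self, in_features, out_features, dropout=0.0):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, x):
        # apply dropout to the input, then the affine map
        return self.linear(self.dropout(x))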
Example #2
    def __init__(self, context_length, embedding_size, dropout=0.0):
        """
        Initialise parameters and layers for Predictor.

        :param context_length: length of the context
        :param embedding_size: hidden embedding size (d2 in the paper)
        :param dropout: dropout rate
        """
        super(Predictor, self).__init__()  # the following build on this

        d2 = embedding_size

        self.f0 = nn.LSTM(d2, d2)  # input_size, output_size
        self.f1 = nn.LSTM(2 * d2, d2)
        self.f2 = nn.LSTM(3 * d2, d2)
        self.f3 = nn.LSTM(3 * d2, d2)

        self.linear_sup = Linear(
            d2, 2, dropout=dropout
        )  # have 2 output dims because we need to weight the classes
        self.linear_start = Linear(
            d2, 1, dropout=dropout
        )  # with a softmax because there can only be one start or end
        self.linear_end = Linear(d2, 1, dropout=dropout)
        self.linear_type = Linear(
            d2, 3,
            dropout=dropout)  # 3 because we have 3 types - yes, no, and span
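
For orientation, the start/end heads above each emit one logit per token, and a softmax over the sequence turns those logits into span-boundary probabilities. A small usage sketch with made-up shapes (illustrative only):

import torch
import torch.nn.functional as F

seq_len, batch, d2 = 50, 4, 80                # hypothetical sizes
h = torch.randn(seq_len, batch, d2)           # per-token hidden states
start_head = torch.nn.Linear(d2, 1)           # stand-in for the dropout-wrapped Linear above
start_logits = start_head(h).squeeze(-1)      # (seq_len, batch)
start_probs = F.softmax(start_logits, dim=0)  # one start position per example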
Example #3
            def global_attention(query):
                # linear map
                y = Linear(query, global_attention_vec_size, True)
                y = y.view(-1, 1, 1, global_attention_vec_size)
                # Attention mask is a softmax of v_g^{\top} * tanh(...)
                s = torch.sum(global_v *
                              torch.tanh(global_hidden_features + y),
                              dim=[1, 3])
                a = torch.softmax(s, dim=1)

                return a
Example #4
 def local_attention(query):
     # linear map
     y = Linear(query, local_attention_vec_size, True)
     y = y.view(-1, 1, 1, local_attention_vec_size)
     # Attention mask is a softmax of v_l^{\top} * tanh(...)
     #print((local_v * torch.tanh(local_hidden_features + y)).size())
     s = torch.sum(local_v * torch.tanh(local_hidden_features + y),
                   dim=[1, 3])
     # Now calculate the attention-weighted vector, i.e., alpha in eq.[2]
     a = torch.softmax(s, dim=1)
     return a
Example #5
        def attention(query):
            # linear map
            y = Linear(query, attention_vec_size, True)
            y = y.view(-1, 1, 1, attention_vec_size)
            # Attention mask is a softmax of v_d^{\top} * tanh(...).
            s = torch.sum(v * torch.tanh(hidden_features + y), dim=[1, 3])
            # Now calculate the attention-weighted vector, i.e., gamma in eq.[7]
            a = torch.softmax(s, dim=1)
            # eq. [8]
            #print(hidden.size())
            #print((a.view(-1, 1, attn_length, 1)).size())
            d = torch.sum(a.view(-1, 1, attn_length, 1) * hidden, dim=[2, 3])

            return d.view(-1, attn_size)
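
The PyTorch attention closures in Examples #3-#5 share the same score -> softmax -> weighted-sum pattern. A self-contained sketch with explicit, made-up shapes (an illustration, not the original module):

import torch

batch, attn_length, attn_size = 2, 10, 16                        # hypothetical sizes
hidden = torch.randn(batch, attn_length, 1, attn_size)           # encoder states, reshaped
hidden_features = torch.randn(batch, attn_length, 1, attn_size)  # precomputed W * h
v = torch.randn(attn_size)                                       # attention vector
y = torch.randn(batch, attn_size).view(-1, 1, 1, attn_size)      # linearly mapped query

s = torch.sum(v * torch.tanh(hidden_features + y), dim=[2, 3])     # scores: (batch, attn_length)
a = torch.softmax(s, dim=1)                                        # weights over time steps
d = torch.sum(a.view(-1, attn_length, 1, 1) * hidden, dim=[1, 2])  # context: (batch, attn_size)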
Example #6
 def global_attention(query):
     """Put attention masks on global_hidden using global_hidden_features and query."""
     # If the query is a tuple (when stacked RNN/LSTM), flatten it
     if nest.is_sequence(query):
         query_list = nest.flatten(query)
         for q in query_list:  # Check that ndims == 2 if specified.
             ndims = q.get_shape().ndims
             if ndims:
                 assert ndims == 2
         query = array_ops.concat(query_list, 1)
     with tf.variable_scope("AttnWg"):
         # linear map
         y = Linear(query, global_attention_vec_size, True)
         y = array_ops.reshape(
             y, [-1, 1, 1, global_attention_vec_size])
         # Attention mask is a softmax of v_g^{\top} * tanh(...)
         s = math_ops.reduce_sum(
             global_v *
             math_ops.tanh(global_hidden_features + y),
             [2, 3])
         # Sometimes it's not easy to find a measure of similarity between sensors,
         # so here we omit such prior knowledge in eq.[4].
         # You can use "a = nn_ops.softmax((1-lambda)*s + lambda*sim)" to encode similarity info,
         # where:
         #     sim: a vector with length n_sensors, describing the sim between the target sensor and the others
         #     lambda: a trade-off.
         a = nn_ops.softmax(s)
     # a = nn_ops.softmax((1 - lambda) * s + lambda * sim)
     return a
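
The commented-out alternative above blends a prior similarity vector into the scores before the softmax. A tiny illustration of that blend in PyTorch (sim and lam are hypothetical inputs):

import torch

s = torch.randn(4, 6)    # raw attention scores: (batch, n_sensors)
sim = torch.rand(6)      # prior similarity between the target sensor and the others
lam = 0.3                # trade-off weight
a = torch.softmax((1 - lam) * s + lam * sim, dim=1)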
Example #7
    def step(self, CurrentInput, PrevState):
        """
        Takes an input and the previous 'state' of the NTM and returns the output
        and the next state. The 'states' are a tuple of two lists of weights (in order
        of read heads, then write heads), and the memory (as a matrix).

        We may also want to see how the memory is being accessed at each step, in which
        case we would append the read and write vectors to the output.

        Weights: list of shape (1,MemoryLength)
        Memory: shape (MemoryDepth, MemoryLength)
        CurrentInput: shape (1,InputDepth)
        """
        ReadWeights, WriteWeights, Memory = PrevState
        ReadInputs = [ReadMemory(W, Memory) for W in ReadWeights]
        ControlInput = tf.concat(1, ReadInputs + [CurrentInput])

        # Now we should put in a control network that takes ControlInput-size
        # inputs and returns a 'control state'.
        ControlState = tf.tanh(
            Linear(ControlInput, self.Params.ControlHiddenSize, 'Controller'))

        Output = tf.sigmoid(
            Linear(ControlState, self.Params.InputDepth, 'Output'))

        NextReadWeights = []
        NextWriteWeights = []
        Adds = []
        Erases = []
        for i in xrange(self.Params.nReadHeads):
            with tf.variable_scope('ReadHead%d' % i):
                NextReadWeights.append(
                    self.HeadUpdate(ControlState, ReadWeights[i], Memory))
        for i in xrange(self.Params.nWriteHeads):
            with tf.variable_scope('WriteHead%d' % i):
                W, E, A = self.HeadUpdate(ControlState,
                                          WriteWeights[i],
                                          Memory,
                                          IsWrite=True)
            NextWriteWeights.append(W)
            Erases.append(E)
            Adds.append(A)
        for i in xrange(self.Params.nWriteHeads):
            Memory = WriteMemory(NextWriteWeights[i], Erases[i], Adds[i],
                                 Memory)

        return Output, (NextReadWeights, NextWriteWeights, Memory)
Example #8
    def HeadUpdate(self, ControlState, PrevWeights, Memory, IsWrite=False):
        """
        For one head, takes the control state, previous weight and memory, and outputs
        the new weight, and for write-heads, the erase and add vectors as well.
        """
        KeyVector = tf.tanh(
            Linear(ControlState, self.Params.MemoryDepth, 'KeyVector'))
        KeyStrength = tf.nn.softplus(Linear(ControlState, 1, 'KeyStrength'))
        Gate = tf.sigmoid(Linear(ControlState, 1, 'Gate'))
        ShiftWeights = tf.nn.softmax(
            Linear(ControlState, len(self.Params.ShiftOffsets),
                   'ShiftWeights'))
        Sharpen = tf.nn.softplus(Linear(ControlState, 1, 'Sharpen')) + 1.

        Weights = tf.exp(KeyStrength * cosine_similarity(KeyVector, Memory))
        Weights /= tf.reduce_sum(Weights, 1)
        Weights = Gate * Weights + (1.0 - Gate) * PrevWeights
        Weights = circular_convolution(Weights, ShiftWeights,
                                       self.Params.ShiftOffsets)
        Weights = tf.pow(Weights, Sharpen)
        Weights /= tf.reduce_sum(Weights, 1)

        if IsWrite:
            Erase = tf.sigmoid(
                Linear(ControlState, self.Params.MemoryDepth, 'Erase'))
            Add = tf.tanh(Linear(ControlState, self.Params.MemoryDepth, 'Add'))
            return Weights, Erase, Add
        else:
            return Weights
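
The cosine_similarity and circular_convolution helpers called above are assumed but not shown. A standalone PyTorch sketch of the same addressing pipeline, content focus, gated interpolation, circular shift, and sharpening, purely for illustration:

import torch

def address(key, beta, gate, shift, gamma, prev_w, memory, eps=1e-8):
    # memory: (depth, length); key: (1, depth); prev_w: (1, length); shift: (3,)
    sim = (key @ memory) / (key.norm() * memory.norm(dim=0) + eps)  # cosine similarity
    w = torch.softmax(beta * sim, dim=1)                            # content-based focus
    w = gate * w + (1.0 - gate) * prev_w                            # gated interpolation
    rolled = torch.stack([torch.roll(w, k, dims=1) for k in (-1, 0, 1)])
    w = (shift.view(3, 1, 1) * rolled).sum(dim=0)                   # shift offsets -1, 0, +1
    w = w.pow(gamma)                                                # sharpening
    return w / w.sum(dim=1, keepdim=True)

weights = address(torch.randn(1, 8), torch.tensor(2.0), torch.tensor(0.5),
                  torch.softmax(torch.randn(3), dim=0), torch.tensor(1.5),
                  torch.softmax(torch.randn(1, 16), dim=1), torch.randn(8, 16))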
Example #9
    def __init__(self, hpm, rand_unif_init, rand_norm_init):
        self.hpm = hpm
        self.rand_unif_init = rand_unif_init
        self.rand_norm_init = rand_norm_init

        with tf.variable_scope('encoder'):
            self.lstm_cell_fw = tf.contrib.rnn.LSTMCell(
                self.hpm["hidden_size"],
                state_is_tuple=True,
                initializer=self.rand_unif_init)  # forward lstm cell
            self.lstm_cell_bw = tf.contrib.rnn.LSTMCell(
                self.hpm["hidden_size"],
                state_is_tuple=True,
                initializer=self.rand_unif_init)  # backward lstm cell

            self.w_c = Linear(
                self.hpm['hidden_size'], True, "reduce_c", self.rand_norm_init
            )  # Parameters for the concatenated state linear transf.
            self.w_h = Linear(
                self.hpm['hidden_size'], True, 'reduce_h', self.rand_norm_init
            )  # Parameters for the concatenated hidden output linear transf.
Example #10
    def __init__(self, n_enc_1, n_enc_2, n_enc_3, n_dec_1, n_dec_2, n_dec_3,
                 n_input, n_z):
        super(AE, self).__init__()

        # encoder
        self.enc_1 = Linear(n_input, n_enc_1)
        self.enc_2 = Linear(n_enc_1, n_enc_2)
        self.enc_3 = Linear(n_enc_2, n_enc_3)

        self.z_layer = Linear(n_enc_3, n_z)

        # decoder
        self.dec_1 = Linear(n_z, n_dec_1)
        self.dec_2 = Linear(n_dec_1, n_dec_2)
        self.dec_3 = Linear(n_dec_2, n_dec_3)

        self.x_bar_layer = Linear(n_dec_3, n_input)
Example #11
 def local_attention(query):
     """Put attention masks on local_hidden using local_hidden_features and query."""
     # If the query is a tuple (when stacked RNN/LSTM), flatten it
     if nest.is_sequence(query):
         query_list = nest.flatten(query)
         for q in query_list:
             ndims = q.get_shape().ndims
             if ndims:
                 assert ndims == 2
         query = array_ops.concat(query_list, 1)
     with tf.variable_scope("AttnWl"):
         # linear map
         y = Linear(query, local_attention_vec_size, True)
         y = array_ops.reshape(
             y, [-1, 1, 1, local_attention_vec_size])
         # Attention mask is a softmax of v_l^{\top} * tanh(...)
         s = math_ops.reduce_sum(
             local_v * math_ops.tanh(local_hidden_features + y), [2, 3])
         # Now calculate the attention-weighted vector, i.e., alpha in eq.[2]
         a = nn_ops.softmax(s)
     return a
Example #12
    def __init__(self, hpm, rand_unif_init, rand_norm_init):
        self.rand_unif_init = rand_unif_init
        self.rand_norm_init = rand_norm_init
        self.hpm = hpm

        with tf.variable_scope('attention_decoder', reuse=tf.AUTO_REUSE):
            self.decoder = Decoder(
                self.hpm, self.rand_unif_init
            )  # simple decoder object (unidirectional LSTM)

            # Almost all the parameters (weights and biases) for the linear transformations (see below in the call method)

            self.w_h = Linear(self.hpm['attn_hidden_size'], True, "h")
            self.w_s = Linear(self.hpm['attn_hidden_size'], True, "s")
            self.v = Linear(1, False, 'V')

            self.w_dec = Linear(self.hpm['emb_size'], True, "dec_inp")
            self.w_out = Linear(self.hpm['vocab_size'], True, 'out')

            if self.hpm['pointer_gen']:
                self.w_c_reduce = Linear(1, True, 'c_reduce')
                self.w_s_reduce = Linear(1, True, 's_reduce')
                self.w_i_reduce = Linear(1, True, 'i_reduce')
Example #13
 def attention(query):
     """Put attention masks on local_hidden using local_hidden_features and query."""
     # If the query is a tuple (when stacked RNN/LSTM), flatten it
     if nest.is_sequence(query):
         query_list = nest.flatten(query)
         for q in query_list:  # Check that ndims == 2 if specified.
             ndims = q.get_shape().ndims
             if ndims:
                 assert ndims == 2
         query = array_ops.concat(query_list, 1)
     with vs.variable_scope("Attn_Wpd"):
         # linear map
         y = Linear(query, attention_vec_size, True)
         y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
         # Attention mask is a softmax of v_d^{\top} * tanh(...).
         s = math_ops.reduce_sum(
             v * math_ops.tanh(hidden_features + y), [2, 3])
         # Now calculate the attention-weighted vector, i.e., gamma in eq.[7]
         a = nn_ops.softmax(s)
         # eq. [8]
         d = math_ops.reduce_sum(
             array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
             [1, 2])
     return array_ops.reshape(d, [-1, attn_size])
Example #14
    def __init__(self, args, pretrained):
        super(BiDAF, self).__init__()
        self.args = args

        # 1. Character Embedding Layer
        self.char_embedding = nn.Embedding(args.char_vocab_size,
                                           args.char_dim,
                                           padding_idx=1)
        nn.init.uniform_(self.char_embedding.weight, -0.001, 0.001)

        self.char_convolution = nn.Sequential(
            nn.Conv2d(1, args.char_channel_size,
                      (args.char_dim, args.char_channel_width)), nn.ReLU())

        # 2. Word Embedding Layer
        # initialize word embedding with GloVe
        self.word_embedding = nn.Embedding.from_pretrained(pretrained,
                                                           freeze=True)

        # highway network
        assert self.args.hidden_size * 2 == (self.args.char_channel_size +
                                             self.args.word_dim)
        for i in range(2):
            setattr(
                self, 'highway_linear{}'.format(i),
                nn.Sequential(
                    Linear(args.hidden_size * 2, args.hidden_size * 2),
                    nn.ReLU()))
            setattr(
                self, 'highway_gate{}'.format(i),
                nn.Sequential(
                    Linear(args.hidden_size * 2, args.hidden_size * 2),
                    nn.Sigmoid()))

        # 3. Contextual Embedding Layer
        self.context_LSTM = LSTM(input_size=args.hidden_size * 2,
                                 hidden_size=args.hidden_size,
                                 bidirectional=True,
                                 batch_first=True,
                                 dropout=args.dropout)

        # 4. Attention Flow Layer
        self.att_weight_c = Linear(args.hidden_size * 2, 1)
        self.att_weight_q = Linear(args.hidden_size * 2, 1)
        self.att_weight_cq = Linear(args.hidden_size * 2, 1)

        # 5. Modeling Layer
        self.modeling_LSTM1 = LSTM(input_size=args.hidden_size * 8,
                                   hidden_size=args.hidden_size,
                                   bidirectional=True,
                                   batch_first=True,
                                   dropout=args.dropout)

        self.modeling_LSTM2 = LSTM(input_size=args.hidden_size * 2,
                                   hidden_size=args.hidden_size,
                                   bidirectional=True,
                                   batch_first=True,
                                   dropout=args.dropout)

        # 6. Output Layer
        self.p1_weight_g = Linear(args.hidden_size * 8,
                                  1,
                                  dropout=args.dropout)
        self.p1_weight_m = Linear(args.hidden_size * 2,
                                  1,
                                  dropout=args.dropout)
        self.p2_weight_g = Linear(args.hidden_size * 8,
                                  1,
                                  dropout=args.dropout)
        self.p2_weight_m = Linear(args.hidden_size * 2,
                                  1,
                                  dropout=args.dropout)

        self.output_LSTM = LSTM(input_size=args.hidden_size * 2,
                                hidden_size=args.hidden_size,
                                bidirectional=True,
                                batch_first=True,
                                dropout=args.dropout)

        self.dropout = nn.Dropout(p=args.dropout)
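
The att_weight_c / att_weight_q / att_weight_cq layers above implement BiDAF's trilinear similarity, a weighted sum of the context vector, the question vector, and their elementwise product. A hedged sketch of how such a similarity matrix and the context-to-query attention are typically assembled (shapes are illustrative, not this repo's exact forward pass):

import torch
import torch.nn as nn

batch, c_len, q_len, d = 2, 7, 5, 8    # hypothetical sizes, d = hidden_size * 2
c = torch.randn(batch, c_len, d)       # context encodings
q = torch.randn(batch, q_len, d)       # question encodings
att_weight_c = nn.Linear(d, 1)
att_weight_q = nn.Linear(d, 1)
att_weight_cq = nn.Linear(d, 1)

s = (att_weight_c(c)                                   # (batch, c_len, 1)
     + att_weight_q(q).transpose(1, 2)                 # (batch, 1, q_len)
     + torch.einsum('bik,bjk->bij', c * att_weight_cq.weight.squeeze(0), q))
c2q = torch.softmax(s, dim=2) @ q                      # context-to-query: (batch, c_len, d)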
Example #15
    def temporal_attention(self,
                           decoder_inputs,
                           external_inputs,
                           initial_state,
                           attention_states,
                           cell,
                           output_size=None,
                           loop_function=None,
                           dtype=tf.float32,
                           scope=None,
                           initial_state_attention=False,
                           external_flag=True):
        """ Temporal attention in GeoMAN
		Args:
			decoder_inputs: A list (length: n_steps_decoder) of 2D Tensors [batch_size, n_input_decoder].
			external_inputs: A list (length: n_steps_decoder) of 2D Tensors [batch_size, n_external_input].
			initial_state: 2D Tensor [batch_size, cell.state_size].
			attention_states: 3D Tensor [batch_size, n_step_encoder, n_hidden_encoder].
			cell: core_rnn_cell.RNNCell defining the cell function and size.
			output_size: Size of the output vectors; if None, we use cell.output_size.
			loop_function: the loop function we use.
			dtype: The dtype to use for the RNN initial state (default: tf.float32).
			scope: VariableScope for the created subgraph; default: "temporal_attn".
			initial_state_attention: If False (default), initial attentions are zero.
			external_flag: whether to use external factors

		Return:
			A tuple of the form (outputs, state), where:
				outputs: A list of the same length as the inputs of decoder of 2D Tensors of
						 shape [batch_size x output_size]
				state: The state of each decoder cell at the final time-step.
		"""
        # check inputs
        if not decoder_inputs:
            raise ValueError(
                "Must provide at least 1 input to attention decoder.")
        if not external_inputs:
            raise ValueError(
                "Must provide at least 1 ext_input to attention decoder.")
        if attention_states.get_shape()[2].value is None:
            raise ValueError("Shape[2] of attention_states must be known: %s" %
                             attention_states.get_shape())
        if output_size is None:
            output_size = cell.output_size

        # implement of temporal attention
        with vs.variable_scope(scope or "temporal_attn", dtype=dtype) as scope:
            dtype = scope.dtype
            # Needed for reshaping.
            batch_size = array_ops.shape(decoder_inputs[0])[0]
            attn_length = attention_states.get_shape()[1].value
            if attn_length is None:
                attn_length = array_ops.shape(attention_states)[1]
            attn_size = attention_states.get_shape()[2].value

            # A trick: to calculate W_d * h_o by a 1-by-1 convolution
            # See eq.[6] in the paper
            hidden = array_ops.reshape(
                attention_states,
                [-1, attn_length, 1, attn_size])  # need to reshape before
            # Size of query vectors for attention.
            attention_vec_size = attn_size
            w = vs.get_variable("Attn_Wd",
                                [1, 1, attn_size, attention_vec_size])  # W_d
            hidden_features = nn_ops.conv2d(hidden, w, [1, 1, 1, 1],
                                            "SAME")  # W_d * h_o
            v = vs.get_variable("Attn_v", [attention_vec_size])  # v_d
            state = initial_state

            def attention(query):
                """Put attention masks on local_hidden using local_hidden_features and query."""
                # If the query is a tuple (when stacked RNN/LSTM), flatten it
                if nest.is_sequence(query):
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(query_list, 1)
                with vs.variable_scope("Attn_Wpd"):
                    # linear map
                    y = Linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v_d^{\top} * tanh(...).
                    s = math_ops.reduce_sum(
                        v * math_ops.tanh(hidden_features + y), [2, 3])
                    # Now calculate the attention-weighted vector, i.e., gamma in eq.[7]
                    a = nn_ops.softmax(s)
                    # eq. [8]
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                return array_ops.reshape(d, [-1, attn_size])

            if initial_state_attention:
                attn = attention(initial_state)
            else:
                batch_attn_size = array_ops.stack([batch_size, attn_size])
                attn = array_ops.zeros(batch_attn_size, dtype=dtype)
                attn.set_shape([None, attn_size])

            i = 0
            outputs = []
            prev = None
            for inp, ext_inp in zip(decoder_inputs, external_inputs):
                if i > 0:
                    vs.get_variable_scope().reuse_variables()
                # If loop_function is set, we use it instead of decoder_inputs.
                if loop_function is not None and prev is not None:
                    with vs.variable_scope("loop_function", reuse=True):
                        inp = loop_function(prev, i)
                # Merge input and previous attentions into one vector of the right size.
                input_size = inp.get_shape().with_rank(2)[1]
                if input_size.value is None:
                    raise ValueError(
                        "Could not infer input size from input: %s" % inp.name)
                # we map the concatenation to shape [batch_size, input_size]
                if external_flag:
                    x = Linear([inp] + [ext_inp] + [attn], input_size, True)
                else:
                    x = Linear([inp] + [attn], input_size, True)
                # Run the RNN.
                cell_output, state = cell(x, state)
                # Run the attention mechanism.
                if i == 0 and initial_state_attention:
                    with vs.variable_scope(vs.get_variable_scope(),
                                           reuse=True):
                        attn = attention(state)
                else:
                    attn = attention(state)
                # Attention output projection
                with vs.variable_scope("AttnOutputProjection"):
                    output = Linear([cell_output] + [attn], output_size, True)
                if loop_function is not None:
                    prev = output
                outputs.append(output)
                i += 1
        return outputs, state
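
A quick PyTorch check (illustrative, independent of the TF code above) that the 1-by-1 convolution trick from eq.[6] is just a per-timestep linear map applied to the encoder states:

import torch
import torch.nn as nn

batch, length, size = 2, 5, 8                  # hypothetical sizes
states = torch.randn(batch, length, size)

conv = nn.Conv2d(size, size, kernel_size=1, bias=False)
lin = nn.Linear(size, size, bias=False)
with torch.no_grad():
    lin.weight.copy_(conv.weight.view(size, size))   # share the same weight matrix

via_conv = conv(states.permute(0, 2, 1).unsqueeze(-1)).squeeze(-1).permute(0, 2, 1)
via_linear = lin(states)
assert torch.allclose(via_conv, via_linear, atol=1e-5)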
Example #16
 def __init__(self, output_num, normalize, linear_size, input_size):
     super(Projection, self).__init__()
     self.normalize = normalize
     self.l1 = Linear(input_size, linear_size, bn=False, activ='relu')
     self.l2 = Linear(linear_size, output_num, bn=False, activ=None)
Example #17
 def __init__(self, num_classes, input_size):
     super(Output, self).__init__()
     self.l1 = Linear(input_size, num_classes, bn=False, activ=None)
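
Examples #16 and #17 call yet another Linear variant, one that takes bn and activ keywords. A minimal sketch of what such a block might look like (an assumption, not the project's code):

import torch.nn as nn

class Linear(nn.Module):
    # Hypothetical fully connected block with optional batch norm and activation.
    def __init__(self, in_features, out_features, bn=False, activ=None):
        super().__init__()
        layers = [nn.Linear(in_features, out_features)]
        if bn:
            layers.append(nn.BatchNorm1d(out_features))
        if activ == 'relu':
            layers.append(nn.ReLU())
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        return self.block(x)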
Example #18
    def temporal_attention(self,
                           decoder_inputs,
                           external_inputs,
                           encoder_state,
                           attention_states,
                           cell,
                           external_flag,
                           output_size=64):
        # Needed for reshaping.
        batch_size = decoder_inputs[0].data.size(0)
        attn_length = attention_states.data.size(1)
        attn_size = attention_states.data.size(2)

        # A trick: to calculate W_d * h_o by a 1-by-1 convolution
        # See eq.[6] in the paper
        hidden = attention_states.view(-1, attn_size, attn_length,
                                       1)  # need to reshape before
        # Size of query vectors for attention.
        attention_vec_size = attn_size
        w_conv = nn.Conv2d(attn_size, attention_vec_size, (1, 1), (1, 1))
        hidden_features = w_conv(hidden)
        #v = Variable(torch.zeros(attention_vec_size)) # v_l
        v = nn.Parameter(torch.FloatTensor(attention_vec_size))
        init.normal_(v)

        def attention(query):
            # linear map
            y = Linear(query, attention_vec_size, True)
            y = y.view(-1, 1, 1, attention_vec_size)
            # Attention mask is a softmax of v_d^{\top} * tanh(...).
            s = torch.sum(v * torch.tanh(hidden_features + y), dim=[1, 3])
            # Now calculate the attention-weighted vector, i.e., gamma in eq.[7]
            a = torch.softmax(s, dim=1)
            # eq. [8]
            #print(hidden.size())
            #print((a.view(-1, 1, attn_length, 1)).size())
            d = torch.sum(a.view(-1, 1, attn_length, 1) * hidden, dim=[2, 3])

            return d.view(-1, attn_size)

        #attn = Variable(torch.zeros(batch_size, attn_size))
        attn = nn.Parameter(torch.FloatTensor(batch_size, attn_size))
        init.xavier_uniform_(attn)

        i = 0
        outputs = []
        prev = None

        for (inp, ext_inp) in zip(decoder_inputs, external_inputs):
            # Merge input and previous attentions into one vector of the right size.

            input_size = inp.data.size(1)
            #print(i, input_size)
            # input_size refers to the vector dimension
            # we map the concatenation to shape [batch_size, input_size]
            if external_flag:
                #print(inp.data.size(1),ext_inp.data.size(1),attn.data.size(1))
                x = Linear([inp.float()] + [ext_inp.float()] + [attn.float()],
                           input_size, True)
            else:
                x = Linear([inp.float()] + [attn.float()], input_size, True)
            # Run the RNN.
            #print(x.size())
            cell_output, state = cell(x)
            # Run the attention mechanism.
            #print(state.size())
            attn = attention([state])

            # Attention output projection
            #print(cell_output.size(), attn.size())
            output = Linear([cell_output] + [attn], output_size, True)
            outputs.append(output)
            i += 1
        return outputs, state
Example #19
 def __init__(self, hidden_size):
     super(BiATT, self).__init__()
     self.att_weight_c = Linear(hidden_size * 2, 1)
     self.att_weight_q = Linear(hidden_size * 2, 1)
     self.att_weight_cq = Linear(hidden_size * 2, 1)