Example #1
    def __init__(self, word_embeddings: TextFieldEmbedder, bin_count: int):

        super(DRMM, self).__init__()

        self.word_embeddings = word_embeddings
        self.cosine_module = CosineMatrixAttention()

        self.bin_count = bin_count
        self.matching_classifier = FeedForward(
            input_dim=bin_count,
            num_layers=2,
            hidden_dims=[bin_count, 1],
            activations=[
                Activation.by_name('tanh')(),
                Activation.by_name('tanh')()
            ])
        self.query_gate = FeedForward(
            input_dim=self.word_embeddings.get_output_dim(),
            num_layers=2,
            hidden_dims=[self.word_embeddings.get_output_dim(), 1],
            activations=[
                Activation.by_name('tanh')(),
                Activation.by_name('tanh')()
            ])
        self.query_softmax = MaskedSoftmax()
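The forward pass is not part of this snippet; as a hedged sketch (function name and shapes are my own), the cosine matrix produced by cosine_module is typically binned into a bin_count-sized matching histogram per query term before it is fed to matching_classifier:

import torch

# Minimal sketch, assuming cosine similarities in [-1, 1]; torch.histc builds the
# per-query-term matching histogram that matches the bin_count input of the FeedForward above.
def matching_histogram(cosine_row: torch.Tensor, bin_count: int) -> torch.Tensor:
    histogram = torch.histc(cosine_row, bins=bin_count, min=-1.0, max=1.0)
    return torch.log1p(histogram)  # log-count histogram variant from the DRMM paper

fake_similarities = torch.rand(200) * 2 - 1   # one query term against 200 document terms
print(matching_histogram(fake_similarities, bin_count=30).shape)  # torch.Size([30])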
Example #2
    def __init__(self,

                 unified_query_length:int,
                 unified_document_length:int,

                 max_conv_kernel_size: int, # 2 to n
                 conv_output_size: int, # conv output channels

                 kmax_pooling_size: int): # per query k-max pooling
                 
        super(PACRR,self).__init__()

        self.cosine_module = CosineMatrixAttention()

        self.unified_query_length = unified_query_length
        self.unified_document_length = unified_document_length

        self.convolutions = []
        for i in range(2, max_conv_kernel_size + 1):
            self.convolutions.append(
                nn.Sequential(
                    nn.ConstantPad2d((0,i - 1,0, i - 1), 0), # this outputs [batch,1,unified_query_length + i - 1,unified_document_length + i - 1]
                    nn.Conv2d(kernel_size=i, in_channels=1, out_channels=conv_output_size), # this outputs [batch,conv_output_size,unified_query_length,unified_document_length]
                    nn.MaxPool3d(kernel_size=(conv_output_size,1,1)) # this outputs [batch,1,unified_query_length,unified_document_length]
            ))
        self.convolutions = nn.ModuleList(self.convolutions) # register conv as part of the model

        self.masked_softmax = MaskedSoftmax()
        self.kmax_pooling_size = kmax_pooling_size

        self.dense = nn.Linear(kmax_pooling_size * unified_query_length * max_conv_kernel_size, out_features=100, bias=True)
        self.dense2 = nn.Linear(100, out_features=10, bias=True)
        self.dense3 = nn.Linear(10, out_features=1, bias=False)
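A short aside on the pad -> Conv2d -> MaxPool3d blocks built above (a sketch with made-up sizes, not taken from the reference repository): the asymmetric ConstantPad2d keeps the spatial size constant for an i x i kernel, and MaxPool3d with kernel (conv_output_size, 1, 1) takes the maximum over the convolution channels, so every n-gram block maps [batch, 1, query_len, doc_len] back to [batch, 1, query_len, doc_len]:

import torch
import torch.nn as nn

i, conv_output_size = 3, 32                      # hypothetical kernel size / channel count
block = nn.Sequential(
    nn.ConstantPad2d((0, i - 1, 0, i - 1), 0),   # pad right/bottom so the conv keeps the size
    nn.Conv2d(kernel_size=i, in_channels=1, out_channels=conv_output_size),
    nn.MaxPool3d(kernel_size=(conv_output_size, 1, 1)),  # max over the channel dimension
)
sim = torch.rand(2, 1, 30, 180)                  # [batch, 1, query_len, doc_len]
print(block(sim).shape)                          # torch.Size([2, 1, 30, 180])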
Example #3
    def __init__(self, word_embeddings: TextFieldEmbedder, n_grams: int,
                 n_kernels: int, conv_out_dim: int):

        super(Conv_KNRM, self).__init__()

        self.word_embeddings = word_embeddings

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.cuda.FloatTensor(self.kernel_mus(n_kernels)),
                           requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.cuda.FloatTensor(
            self.kernel_sigmas(n_kernels)),
                              requires_grad=False).view(1, 1, 1, n_kernels)

        self.convolutions = []
        for i in range(1, n_grams + 1):
            self.convolutions.append(
                nn.Sequential(
                    nn.ConstantPad1d((0, i - 1), 0),
                    nn.Conv1d(kernel_size=i,
                              in_channels=word_embeddings.get_output_dim(),
                              out_channels=conv_out_dim), nn.ReLU()))
        self.convolutions = nn.ModuleList(
            self.convolutions)  # register conv as part of the model

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
        self.cosine_module = CosineMatrixAttention()

        # n_kernels * n_grams * n_grams, because we concatenate all n-gram match combinations (e.g. 3x3 = 9 for n_grams = 3) before the dense layer
        self.dense = nn.Linear(n_kernels * n_grams * n_grams, 1, bias=False)

        # init with small weights, otherwise the dense output is way too high for the tanh -> resulting in loss == 1 all the time
        torch.nn.init.uniform_(self.dense.weight, -0.014,
                               0.014)  # inits taken from matchzoo
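The mu/sigma tensors above parameterize the Gaussian match kernels of (Conv-)KNRM. A hedged sketch of the soft-TF kernel pooling they are typically used for (the log-sum structure follows the KNRM paper; the function and variable names are my own):

import torch

def kernel_pool(cosine_matrix, doc_mask, mu, sigma):
    # cosine_matrix: [batch, query_len, doc_len]; mu / sigma: [1, 1, 1, n_kernels]
    sim = cosine_matrix.unsqueeze(-1)                           # [B, Q, D, 1]
    kernels = torch.exp(-0.5 * (sim - mu) ** 2 / sigma ** 2)    # RBF kernel responses
    kernels = kernels * doc_mask.unsqueeze(1).unsqueeze(-1)     # zero out padded doc terms
    per_query = torch.log(kernels.sum(dim=2).clamp(min=1e-10))  # soft-TF per kernel, then log
    return per_query.sum(dim=1)                                 # [B, n_kernels]

mu = torch.linspace(-0.9, 1.0, 5).view(1, 1, 1, 5)
sigma = torch.full((1, 1, 1, 5), 0.1)
scores = kernel_pool(torch.rand(2, 4, 7) * 2 - 1, torch.ones(2, 7), mu, sigma)
print(scores.shape)   # torch.Size([2, 5])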
Example #4
    def __init__(self, _embsize: int, kernels_mu: List[float],
                 kernels_sigma: List[float], att_heads: int, att_layer: int,
                 att_proj_dim: int, att_ff_dim: int, win_size: List[int],
                 max_windows: List[int]):

        super(TK_v2, self).__init__()

        n_kernels = len(kernels_mu)

        if len(kernels_mu) != len(kernels_sigma):
            raise Exception("len(kernels_mu) != len(kernels_sigma)")

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.cuda.FloatTensor(kernels_mu),
                           requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.cuda.FloatTensor(kernels_sigma),
                              requires_grad=False).view(1, 1, 1, n_kernels)
        self.mixer = nn.Parameter(
            torch.full([1, 1, 1], 0.5, dtype=torch.float32,
                       requires_grad=True))

        self.stacked_att = StackedSelfAttentionEncoder(
            input_dim=_embsize,
            hidden_dim=_embsize,
            projection_dim=att_proj_dim,
            feedforward_hidden_dim=att_ff_dim,
            num_layers=att_layer,
            num_attention_heads=att_heads,
            dropout_prob=0,
            residual_dropout_prob=0,
            attention_dropout_prob=0)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
        self.cosine_module = CosineMatrixAttention()

        self.nn_scaler = nn.ParameterList([
            nn.Parameter(
                torch.full([1], 0.01, dtype=torch.float32, requires_grad=True))
            for w in win_size
        ])

        self.kernel_weights = nn.ModuleList(
            [nn.Linear(n_kernels, 1, bias=False) for w in win_size])

        self.window_size = win_size
        self.window_scorer = []
        for w in max_windows:
            l = nn.Linear(w, 1, bias=False)
            torch.nn.init.constant_(l.weight, 1 / w)
            self.window_scorer.append(l)

        self.window_scorer = nn.ModuleList(self.window_scorer)

        self.window_merger = nn.Linear(len(self.window_size), 1, bias=False)
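The scalar mixer parameter defined above is not used inside this snippet; in the TK models it typically acts as a learned convex combination of the raw term embeddings and the contextualized (self-attention) encodings. A minimal sketch under that assumption:

import torch

embeddings = torch.rand(2, 10, 8)    # [batch, seq_len, emb_dim], hypothetical sizes
contextual = torch.rand(2, 10, 8)    # output of the stacked self-attention encoder
mixer = torch.full([1, 1, 1], 0.5, requires_grad=True)

# learned blend of non-contextual and contextual representations
mixed = mixer * embeddings + (1 - mixer) * contextual
print(mixed.shape)                   # torch.Size([2, 10, 8])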
Example #5
    def __init__(self, _embsize: int, kernels_mu: List[float],
                 kernels_sigma: List[float], att_heads: int, att_layer: int,
                 att_proj_dim: int, att_ff_dim: int):

        super(TK_v1, self).__init__()

        n_kernels = len(kernels_mu)

        if len(kernels_mu) != len(kernels_sigma):
            raise Exception("len(kernels_mu) != len(kernels_sigma)")

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.cuda.FloatTensor(kernels_mu),
                           requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.cuda.FloatTensor(kernels_sigma),
                              requires_grad=False).view(1, 1, 1, n_kernels)
        self.nn_scaler = nn.Parameter(
            torch.full([1], 0.01, dtype=torch.float32, requires_grad=True))
        self.mixer = nn.Parameter(
            torch.full([1, 1, 1], 0.5, dtype=torch.float32,
                       requires_grad=True))

        self.stacked_att = StackedSelfAttentionEncoder(
            input_dim=_embsize,
            hidden_dim=_embsize,
            projection_dim=att_proj_dim,
            feedforward_hidden_dim=att_ff_dim,
            num_layers=att_layer,
            num_attention_heads=att_heads,
            dropout_prob=0,
            residual_dropout_prob=0,
            attention_dropout_prob=0)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
        self.cosine_module = CosineMatrixAttention()

        # bias is set to True in original code (we found it to not help, how could it?)
        self.dense = nn.Linear(n_kernels, 1, bias=False)
        self.dense_mean = nn.Linear(n_kernels, 1, bias=False)
        self.dense_comb = nn.Linear(2, 1, bias=False)

        # init with small weights, otherwise the dense output is way too high for the tanh -> resulting in loss == 1 all the time
        torch.nn.init.uniform_(self.dense.weight, -0.014,
                               0.014)  # inits taken from matchzoo
        torch.nn.init.uniform_(self.dense_mean.weight, -0.014,
                               0.014)  # inits taken from matchzoo

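A two-line illustration (made-up numbers) of why the initialization comment above matters: with large dense outputs, tanh saturates at +/-1, so a pairwise margin loss gets stuck at its margin.

import torch

print(torch.tanh(torch.tensor([0.01, 5.0, 50.0])))   # tensor([0.0100, 0.9999, 1.0000])
# once both the relevant and the non-relevant score saturate near 1,
# the margin ranking loss stays constant and the gradients vanish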
Example #6
    def __init__(self, word_embeddings_out_dim: int):

        super(Duet, self).__init__()

        NUM_HIDDEN_NODES = word_embeddings_out_dim
        POOLING_KERNEL_WIDTH_QUERY = 18
        POOLING_KERNEL_WIDTH_DOC = 100
        DROPOUT_RATE = 0

        NUM_POOLING_WINDOWS_DOC = 99
        MAX_DOC_TERMS = 200
        MAX_QUERY_TERMS = 30
        self.cosine_module = CosineMatrixAttention()

        self.duet_local = nn.Sequential(
            nn.Conv1d(MAX_DOC_TERMS, NUM_HIDDEN_NODES, kernel_size=1),
            nn.ReLU(), Flatten(), nn.Dropout(p=DROPOUT_RATE),
            nn.Linear(NUM_HIDDEN_NODES * MAX_QUERY_TERMS, NUM_HIDDEN_NODES),
            nn.ReLU(), nn.Dropout(p=DROPOUT_RATE),
            nn.Linear(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES), nn.ReLU(),
            nn.Dropout(p=DROPOUT_RATE))
        self.duet_dist_q = nn.Sequential(
            nn.Conv1d(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES, kernel_size=3),
            nn.ReLU(), nn.MaxPool1d(POOLING_KERNEL_WIDTH_QUERY), Flatten(),
            nn.Linear(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES), nn.ReLU())
        self.duet_dist_d = nn.Sequential(
            nn.Conv1d(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES, kernel_size=3),
            nn.ReLU(), nn.MaxPool1d(POOLING_KERNEL_WIDTH_DOC, stride=1),
            nn.Conv1d(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES, kernel_size=1),
            nn.ReLU())
        self.duet_dist = nn.Sequential(
            Flatten(), nn.Dropout(p=DROPOUT_RATE),
            nn.Linear(NUM_HIDDEN_NODES * NUM_POOLING_WINDOWS_DOC,
                      NUM_HIDDEN_NODES), nn.ReLU(), nn.Dropout(p=DROPOUT_RATE),
            nn.Linear(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES), nn.ReLU(),
            nn.Dropout(p=DROPOUT_RATE))
        self.duet_comb = nn.Sequential(
            nn.Linear(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES), nn.ReLU(),
            nn.Dropout(p=DROPOUT_RATE),
            nn.Linear(NUM_HIDDEN_NODES, NUM_HIDDEN_NODES), nn.ReLU(),
            nn.Dropout(p=DROPOUT_RATE), nn.Linear(NUM_HIDDEN_NODES, 1),
            nn.ReLU())

        #self.scale                  = nn.Parameter(torch.tensor([0.1]), requires_grad=True)

        def init_normal(m):
            if type(m) == nn.Linear:
                nn.init.uniform_(m.weight, 0, 0.01)

        self.duet_comb.apply(init_normal)
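Flatten, used in the Sequential blocks above, is not an nn built-in in older PyTorch versions; the repository presumably defines a small helper along these lines (a sketch, not the original definition):

import torch
import torch.nn as nn

class Flatten(nn.Module):
    """Collapses every dimension except the batch dimension."""
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x.view(x.size(0), -1)

print(Flatten()(torch.rand(4, 3, 5)).shape)   # torch.Size([4, 15])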
Example #7
    def __init__(self, word_embeddings: TextFieldEmbedder, n_kernels: int):

        super(KNRM, self).__init__()

        self.word_embeddings = word_embeddings

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.FloatTensor(self.kernel_mus(n_kernels)),
                           requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.FloatTensor(self.kernel_sigmas(n_kernels)),
                              requires_grad=False).view(1, 1, 1, n_kernels)

        #Cosine matrix
        self.cosine_module = CosineMatrixAttention()
        # Initialize the linear scoring layer:
        self.transform = nn.Linear(n_kernels, out_features=1, bias=True)
Example #8
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 conv_output_size: List[int],
                 conv_kernel_size: List[Tuple[int, int]],
                 adaptive_pooling_size: List[Tuple[int, int]]):

        super(MatchPyramid, self).__init__()

        self.word_embeddings = word_embeddings
        self.cosine_module = CosineMatrixAttention()
        #self.cosine_module = DotProductMatrixAttention()

        if len(conv_output_size) != len(conv_kernel_size) or len(
                conv_output_size) != len(adaptive_pooling_size):
            raise Exception(
                "conv_output_size, conv_kernel_size, adaptive_pooling_size must have the same length"
            )

        conv_layer_dict = OrderedDict()
        last_channel_out = 1
        for i in range(len(conv_output_size)):
            conv_layer_dict["pad " + str(i)] = nn.ConstantPad2d(
                (0, conv_kernel_size[i][0] - 1, 0, conv_kernel_size[i][1] - 1),
                0)
            conv_layer_dict["conv " + str(i)] = nn.Conv2d(
                kernel_size=conv_kernel_size[i],
                in_channels=last_channel_out,
                out_channels=conv_output_size[i])
            conv_layer_dict["relu " + str(i)] = nn.ReLU()
            conv_layer_dict["pool " + str(i)] = nn.AdaptiveMaxPool2d(
                adaptive_pooling_size[i]
            )  # this is unusual - but this is how it is written in the paper
            # (one would expect pooling only at the end?)
            last_channel_out = conv_output_size[i]

        self.conv_layers = nn.Sequential(conv_layer_dict)

        #self.dropout = nn.Dropout(0)

        self.dense = nn.Linear(conv_output_size[-1] *
                               adaptive_pooling_size[-1][0] *
                               adaptive_pooling_size[-1][1],
                               out_features=100,
                               bias=True)
        self.dense2 = nn.Linear(100, out_features=10, bias=True)
        self.dense3 = nn.Linear(10, out_features=1, bias=False)
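nn.AdaptiveMaxPool2d, applied after every convolution above, always produces the requested output size regardless of the input's spatial size; that is what makes the input dimension of dense computable from adaptive_pooling_size alone. A quick illustration with arbitrary sizes:

import torch
import torch.nn as nn

pool = nn.AdaptiveMaxPool2d((3, 10))           # target (height, width)
print(pool(torch.rand(2, 16, 25, 190)).shape)  # torch.Size([2, 16, 3, 10])
print(pool(torch.rand(2, 16, 7, 40)).shape)    # torch.Size([2, 16, 3, 10])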
Example #9
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 num_highway_layers: int,
                 phrase_layer: Seq2SeqEncoder,
                 modeling_layer: Seq2SeqEncoder,
                 span_end_encoder: Seq2SeqEncoder,
                 dropout: float = 0.2,
                 mask_lstms: bool = True,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: RegularizerApplicator = RegularizerApplicator()):
        super(BidirectionalAttentionFlow, self).__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._highway_layer = TimeDistributed(
            Highway(text_field_embedder.get_output_dim(), num_highway_layers))
        self._phrase_layer = phrase_layer
        self._matrix_attention = CosineMatrixAttention()
        self._modeling_layer = modeling_layer
        self._span_end_encoder = span_end_encoder

        encoding_dim = phrase_layer.get_output_dim()
        modeling_dim = modeling_layer.get_output_dim()
        span_start_input_dim = encoding_dim * 4 + modeling_dim
        self._span_start_predictor = TimeDistributed(
            torch.nn.Linear(span_start_input_dim, 1))

        span_end_encoding_dim = span_end_encoder.get_output_dim()
        span_end_input_dim = encoding_dim * 4 + span_end_encoding_dim
        self._span_end_predictor = TimeDistributed(
            torch.nn.Linear(span_end_input_dim, 1))

        # Bidaf has lots of layer dimensions which need to match up - these aren't necessarily
        # obvious from the configuration files, so we check here.
        check_dimensions_match(modeling_layer.get_input_dim(),
                               4 * encoding_dim, "modeling layer input dim",
                               "4 * encoding dim")
        check_dimensions_match(text_field_embedder.get_output_dim(),
                               phrase_layer.get_input_dim(),
                               "text field embedder output dim",
                               "phrase layer input dim")
        check_dimensions_match(span_end_encoder.get_input_dim(),
                               4 * encoding_dim + 3 * modeling_dim,
                               "span end encoder input dim",
                               "4 * encoding dim + 3 * modeling dim")

        self._span_start_accuracy = CategoricalAccuracy()
        self._span_end_accuracy = CategoricalAccuracy()
        self._span_accuracy = BooleanAccuracy()
        self._squad_metrics = SquadEmAndF1()
        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
        self._mask_lstms = mask_lstms

        initializer(self)
Example #10
    def __init__(self, n_kernels: int):

        super(KNRM, self).__init__()

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.cuda.FloatTensor(self.kernel_mus(n_kernels)),
                           requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.cuda.FloatTensor(
            self.kernel_sigmas(n_kernels)),
                              requires_grad=False).view(1, 1, 1, n_kernels)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
        self.cosine_module = CosineMatrixAttention()

        # bias is set to True in original code (we found it to not help, how could it?)
        self.dense = nn.Linear(n_kernels, 1, bias=False)

        # init with small weights, otherwise the dense output is way too high for the tanh -> resulting in loss == 1 all the time
        torch.nn.init.uniform_(self.dense.weight, -0.014,
                               0.014)  # inits taken from matchzoo
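kernel_mus and kernel_sigmas are helper methods that are not part of this snippet; a common implementation (an assumption here, modeled on the original K-NRM code) places one exact-match kernel at mu = 1.0 and spreads the remaining kernel centers evenly over [-1, 1]:

def kernel_mus(n_kernels: int):
    mus = [1.0]                          # exact-match kernel
    bin_size = 2.0 / (n_kernels - 1)
    mus.append(1.0 - bin_size / 2)
    for i in range(1, n_kernels - 1):
        mus.append(mus[i] - bin_size)
    return mus

def kernel_sigmas(n_kernels: int):
    # a tiny sigma for the exact-match kernel, a wider one for the soft-match kernels
    return [0.001] + [0.1] * (n_kernels - 1)

print(kernel_mus(11))      # [1.0, 0.9, 0.7, 0.5, ..., -0.7, -0.9]
print(kernel_sigmas(11))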
Example #11
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 n_grams: int,
                 n_kernels: int,
                 conv_out_dim: int):

        super(Conv_KNRM, self).__init__()

        self.word_embeddings = word_embeddings

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.FloatTensor(self.kernel_mus(n_kernels)), requires_grad = False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.FloatTensor(self.kernel_sigmas(n_kernels)), requires_grad = False).view(1, 1, 1,
                                                                                                          n_kernels)

        # Implement a 1-dimensional CNN layer for each n-gram type
        # Also, use ReLU as the activation function
        self.convolutions = []
        for i in range (1, n_grams + 1):
            self.convolutions.append(nn.Sequential(
            nn.ConstantPad1d((0 , i-1 ), 0),
            # the kernel size of the convolutional layer is the same as the current i-gram (uni, bi, tri, ...) in the loop
            nn.Conv1d(kernel_size = i, in_channels = word_embeddings.get_output_dim(), out_channels = conv_out_dim),
            nn.ReLU()))
            # register conv as part of the model
        self.convolutions = nn.ModuleList(self.convolutions)

        #Cosine similarity matrix
        self.cosine_module = CosineMatrixAttention()


        # Initialize the linear scoring layer:
        # input size: n_kernels * n_grams * n_grams = the number of soft-TF features from
        # all combinations of match-matrix creation (n-gram pairs from query and document embeddings)
        # the output is a single ranking score
        # also use a bias, following the paper formula (it is True by default, but just to make sure)
        self.transform = nn.Linear(in_features = n_kernels * n_grams * n_grams, out_features = 1, bias = True)
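A tiny worked example (hypothetical hyperparameters) of the input size described in the comment above:

n_grams, n_kernels = 3, 11
# every query n-gram type is matched against every document n-gram type, and each of the
# n_grams * n_grams match matrices contributes n_kernels soft-TF features
print(n_kernels * n_grams * n_grams)   # 99 input features -> 1 ranking score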
Example #12
    def __init__(self, conv_output_size: List[int],
                 conv_kernel_size: List[Tuple[int, int]],
                 adaptive_pooling_size: List[Tuple[int, int]]):

        super(MatchPyramid, self).__init__()

        self.cosine_module = CosineMatrixAttention()

        if len(conv_output_size) != len(conv_kernel_size) or len(
                conv_output_size) != len(adaptive_pooling_size):
            raise Exception(
                "conv_output_size, conv_kernel_size, adaptive_pooling_size must have the same length"
            )

        conv_layer_dict = OrderedDict()
        last_channel_out = 1
        for i in range(len(conv_output_size)):
            conv_layer_dict["pad " + str(i)] = nn.ConstantPad2d(
                (0, conv_kernel_size[i][0] - 1, 0, conv_kernel_size[i][1] - 1),
                0)
            conv_layer_dict["conv " + str(i)] = nn.Conv2d(
                kernel_size=conv_kernel_size[i],
                in_channels=last_channel_out,
                out_channels=conv_output_size[i])
            conv_layer_dict["relu " + str(i)] = nn.ReLU()
            conv_layer_dict["pool " + str(i)] = nn.AdaptiveMaxPool2d(
                adaptive_pooling_size[i])
            last_channel_out = conv_output_size[i]

        self.conv_layers = nn.Sequential(conv_layer_dict)

        self.dense = nn.Linear(conv_output_size[-1] *
                               adaptive_pooling_size[-1][0] *
                               adaptive_pooling_size[-1][1],
                               out_features=100,
                               bias=True)
        self.dense2 = nn.Linear(100, out_features=10, bias=True)
        self.dense3 = nn.Linear(10, out_features=1, bias=False)
Example #13
    def __init__(self, word_embeddings: TextFieldEmbedder, vocab: Vocabulary,
                 lstm_hidden_dim: int, top_k: int, cuda_device: int) -> None:
        super().__init__(vocab)

        self.word_embeddings = word_embeddings

        self.query_rep = nn.LSTM(self.word_embeddings.get_output_dim(),
                                 lstm_hidden_dim,
                                 batch_first=True,
                                 bidirectional=True)
        self.doc_rep = nn.LSTM(self.word_embeddings.get_output_dim(),
                               lstm_hidden_dim,
                               batch_first=True,
                               bidirectional=True)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
        self.cosine_module = CosineMatrixAttention()

        self.top_k = top_k

        self.dense = nn.Linear(top_k, out_features=20, bias=True)
        self.dense2 = nn.Linear(20, out_features=20, bias=True)
        self.dense3 = nn.Linear(20, out_features=1, bias=False)
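The forward pass (shown in full in Example #15 below) flattens the query-document cosine matrix and keeps its top_k values; in isolation, that pooling step looks like this (made-up sizes):

import torch

cosine_matrix = torch.rand(2, 14, 180) * 2 - 1          # [batch, query_len, doc_len]
cosine_flat = cosine_matrix.view(cosine_matrix.shape[0], -1)
top_k_elements = torch.topk(cosine_flat, k=10, sorted=True)[0]
print(top_k_elements.shape)                             # torch.Size([2, 10])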
Example #14
    def __init__(self, vocab: Vocabulary,
                 char_embedder: TextFieldEmbedder,
                 word_embedder: TextFieldEmbedder,
                 tokens_encoder: Seq2SeqEncoder,
                 model_args,
                 inp_drop_rate: float = 0.5,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        """
        :param vocab: vocabulary from train and dev dataset
        :param char_embedder: character embedding + cnn encoder
        :param word_embedder: word embedding
        :param tokens_encoder: Bi-LSTM backbone for split
        :param model_args: model arguments
        :param inp_drop_rate: input dropout rate
        """
        super(FollowUpSnippetModel, self).__init__(vocab, regularizer)

        self.tokens_encoder = tokens_encoder

        self.projection_layer = torch.nn.Linear(
            in_features=word_embedder.get_output_dim() + 1 + char_embedder.get_output_dim(),
            out_features=self.tokens_encoder.get_input_dim(),
            bias=False)

        # integer to mark field, 0 or 1
        self.num_classes = 2
        self.num_conflicts = 2

        self._non_linear = torch.nn.PReLU()

        self.hidden_size = int(self.tokens_encoder.get_output_dim() / 2)

        self.policy_net = PolicyNet(self.tokens_encoder.get_output_dim() * 3,
                                    self.num_classes)

        self.token_field_embedding = word_embedder
        self.char_field_embedding = char_embedder

        self._scaled_value = 1.0
        self._self_attention = CosineMatrixAttention()

        self.margin_loss = MarginRankingLoss(margin=model_args.margin)

        # calculate span similarity
        self.cosine_similar = CosineSimilarity(dim=0)

        if inp_drop_rate > 0:
            self._variational_dropout = InputVariationalDropout(p=inp_drop_rate)
        else:
            self._variational_dropout = lambda x: x

        self.metrics = {
            "bleu": BLEUScore(),
            "reward": RewardScore(),
            "symbol": SymbolScore(),
            "reward_var": RewardScore(),
            "overall": RewardScore()
        }

        initializer(self)
Example #15
class MV_LSTM(Model):
    '''
    Paper: A Deep Architecture for Semantic Matching with Multiple Positional Sentence Representations, Wan et al., AAAI'16

    Reference code (paper author): https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/mvlstm.py (but in tensorflow)

    '''
    def __init__(self, word_embeddings: TextFieldEmbedder, vocab: Vocabulary,
                 lstm_hidden_dim: int, top_k: int, cuda_device: int) -> None:
        super().__init__(vocab)

        self.word_embeddings = word_embeddings

        self.query_rep = nn.LSTM(self.word_embeddings.get_output_dim(),
                                 lstm_hidden_dim,
                                 batch_first=True,
                                 bidirectional=True)
        self.doc_rep = nn.LSTM(self.word_embeddings.get_output_dim(),
                               lstm_hidden_dim,
                               batch_first=True,
                               bidirectional=True)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
        self.cosine_module = CosineMatrixAttention()

        self.top_k = top_k

        self.dense = nn.Linear(top_k, out_features=20, bias=True)
        self.dense2 = nn.Linear(20, out_features=20, bias=True)
        self.dense3 = nn.Linear(20, out_features=1, bias=False)

    def forward(self, query: Dict[str, torch.Tensor],
                document: Dict[str, torch.Tensor], query_length: torch.Tensor,
                document_length: torch.Tensor) -> torch.Tensor:
        # pylint: disable=arguments-differ

        #
        # prepare embedding tensors & paddings masks
        # -------------------------------------------------------

        # we assume 1 is the unknown token, 0 is padding - both need to be removed
        if len(query["tokens"].shape) == 2:  # (embedding lookup matrix)
            # shape: (batch, query_max)
            query_pad_oov_mask = (query["tokens"] > 1).float()
            # shape: (batch, doc_max)
            document_pad_oov_mask = (document["tokens"] > 1).float()
        else:  # == 3 (elmo characters per word)
            # shape: (batch, query_max)
            query_pad_oov_mask = (torch.sum(query["tokens"], 2) > 0).float()
            # shape: (batch, doc_max)
            document_pad_oov_mask = (torch.sum(document["tokens"], 2) >
                                     0).float()

        # shape: (batch, query_max,emb_dim)
        query_embeddings = self.word_embeddings(
            query) * query_pad_oov_mask.unsqueeze(-1)
        # shape: (batch, document_max,emb_dim)
        document_embeddings = self.word_embeddings(
            document) * document_pad_oov_mask.unsqueeze(-1)

        #
        # contextualized rep (via lstms)
        # -------------------------------------------------------

        #hidden_d = torch.randn(())

        query_rep, hidden_q = self.query_rep(query_embeddings)
        document_rep, hidden_d = self.doc_rep(document_embeddings)

        #
        # cosine matrix
        # -------------------------------------------------------

        # shape: (batch, query_max, doc_max)
        cosine_matrix = self.cosine_module.forward(query_rep, document_rep)

        #
        # topk pooling
        # -------------------------------------------------------

        cosine_flat = cosine_matrix.view(cosine_matrix.shape[0], -1)

        top_k_elements = torch.topk(cosine_flat, k=self.top_k, sorted=True)[0]

        ##
        ## "MLP" layer
        ## -------------------------------------------------------

        dense_out = F.relu(self.dense(top_k_elements))
        dense_out = F.relu(self.dense2(dense_out))
        dense_out = self.dense3(dense_out)

        output = torch.squeeze(dense_out, 1)
        return output
Example #16
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 contextualizer: Seq2SeqEncoder,
                 labeler: Seq2SeqEncoder,
                 projection_size: int,
                 bidirectional: bool = False,
                 use_hypothesis: bool = True,
                 attention: str = "", # "" - none / cosine / bilinear
                 initializer: InitializerApplicator = None,
                 classifier_dir = "",
                 del_perc_lambda = 1,
                 del_perc = 0.3,
                 del_metric_threshold = 0.1,
                 teacher_lambda = 0.0,
                 coverage_lambda = 0.0,
                 transition_lamb = 0.0,
                 gumbel = True,
                 neutral_label = "") -> None:
        super().__init__(vocab)
        self._text_field_embedder = text_field_embedder

        if contextualizer.is_bidirectional() is not bidirectional:
            raise ConfigurationError(
                    "Bidirectionality of contextualizer must match bidirectionality of "
                    "language model. "
                    f"Contextualizer bidirectional: {contextualizer.is_bidirectional()}, "
                    f"language model bidirectional: {bidirectional}")

        self.classifier_dir = classifier_dir
        self.classifier = None
        self.coverage_lambda = coverage_lambda
        self.del_perc_lambda = del_perc_lambda
        self.del_perc = del_perc
        self.teacher_lambda = teacher_lambda
        self.transition_lamb = transition_lamb
        self.gumbel = gumbel
        if classifier_dir != "":
            overrides = '{"model": {"dropout": 0, "output_feedforward": {"dropout": 0}}}'
            overrides = ""
            archive = load_archive(classifier_dir, overrides=overrides)

            self.classifier = archive.model
            # Freeze parameters
            for p in self.classifier.parameters():
                p.requires_grad = False

            # A hack that prevents allennlp from crashing when running extend on all submodules
            def foo(*x, **y): return 1
            self.classifier._text_field_embedder.token_embedder_tokens.extend_vocab = foo
            self.classifier.eval()

            # get index of the neutral label
            self.neutral_ind = self.classifier.vocab.get_token_index(neutral_label, 'labels')

        self.criterion = torch.nn.CrossEntropyLoss()

        self._contextualizer = contextualizer
        self._labeler = labeler
        self._bidirectional = bidirectional
        self.use_hypothesis = use_hypothesis
        self.attention = attention
        self.projection_size = projection_size

        # hypothesis aggr
        self.w_prem = torch.nn.Linear(contextualizer.get_output_dim(), projection_size)
        if use_hypothesis:
            self.w_hyp = torch.nn.Linear(contextualizer.get_output_dim(), projection_size)

        self._contextual_dim = contextualizer.get_output_dim()
        # The dimension for making predictions just in the forward
        # (or backward) direction.
        if self._bidirectional:
            self._forward_dim = self._contextual_dim // 2
        else:
            self._forward_dim = self._contextual_dim

        if self.attention:
            if self.attention == "cosine":
                self.attention_mat = CosineMatrixAttention()
            elif self.attention == "bilinear":
                self.attention_mat = BilinearMatrixAttention(self._forward_dim, self._forward_dim)
            else:
                raise ConfigurationError("Undefined attention type")

        self.mask_linear = torch.nn.Linear(self._labeler.get_output_dim(), 2)

        self._accuracy = CategoricalAccuracy()
        self._avg_perc_masked = Average()
        self._avg_transition = Average()
        self._acc_vs_del = AccuracyVSDeletion(del_threshold=del_metric_threshold)
        self._acc_plus_del = AccuracyVSDeletion(del_threshold=0, aggr="sum")
        self._f1_deletions = F1SequenceMeasure(positive_label=1)
        if initializer is not None:
            initializer(self)
Example #17
class CO_PACRR(nn.Module):
    '''
    Paper: Co-PACRR: A Context-Aware Neural IR Model for Ad-hoc Retrieval, Hui et al., WSDM'18

    Reference code (but in tensorflow):
    
    * first-hand: https://github.com/khui/copacrr/blob/master/models/pacrr.py
    
    differences to pacrr: 
      * (1) context vector (query avg, document rolling window avg pool)
      * (2) cascade k-max pooling
      * (3) shuffling query terms at the end
    '''
    @staticmethod
    def from_config(config, word_embeddings_out_dim):
        return CO_PACRR(
            unified_query_length=config["pacrr_unified_query_length"],
            unified_document_length=config["pacrr_unified_document_length"],
            max_conv_kernel_size=config["pacrr_max_conv_kernel_size"],
            conv_output_size=config["pacrr_conv_output_size"],
            kmax_pooling_size=config["pacrr_kmax_pooling_size"])

    def __init__(
        self,
        unified_query_length: int,
        unified_document_length: int,
        max_conv_kernel_size: int,  # 2 to n
        conv_output_size: int,  # conv output channels
        kmax_pooling_size: int):  # per query k-max pooling

        super(CO_PACRR, self).__init__()

        self.cosine_module = CosineMatrixAttention()

        self.unified_query_length = unified_query_length
        self.unified_document_length = unified_document_length

        self.convolutions = []
        for i in range(2, max_conv_kernel_size + 1):
            self.convolutions.append(
                nn.Sequential(
                    nn.ConstantPad2d(
                        (0, i - 1, 0, i - 1), 0
                    ),  # this outputs [batch,1,unified_query_length + i - 1 ,unified_document_length + i - 1]
                    nn.Conv2d(
                        kernel_size=i,
                        in_channels=1,
                        out_channels=conv_output_size
                    ),  # this outputs [batch,conv_output_size,unified_query_length,unified_document_length]
                    nn.MaxPool3d(
                        kernel_size=(conv_output_size, 1, 1)
                    )  # this outputs [batch,1,unified_query_length,unified_document_length]
                ))
        self.convolutions = nn.ModuleList(
            self.convolutions)  # register conv as part of the model

        context_pool_size = 6
        self.doc_context_pool = nn.Sequential(
            nn.ConstantPad1d((0, context_pool_size - 1), 0),
            nn.AvgPool1d(kernel_size=context_pool_size, stride=1))

        self.masked_softmax = MaskedSoftmax()
        self.kmax_pooling_size = kmax_pooling_size

        kmax_pooling_view_percent = [0.25, 0.5, 0.75, 1]
        self.kmax_pooling_views = [
            int(unified_document_length * x) for x in kmax_pooling_view_percent
        ]

        self.dense = nn.Linear(len(self.kmax_pooling_views) * 2 *
                               kmax_pooling_size * unified_query_length *
                               max_conv_kernel_size,
                               out_features=100,
                               bias=True)
        self.dense2 = nn.Linear(100, out_features=10, bias=True)
        self.dense3 = nn.Linear(10, out_features=1, bias=False)

    def forward(self,
                query_embeddings: torch.Tensor,
                document_embeddings: torch.Tensor,
                query_pad_oov_mask: torch.Tensor,
                document_pad_oov_mask: torch.Tensor,
                query_idfs: torch.Tensor,
                document_idfs: torch.Tensor,
                output_secondary_output: bool = False) -> torch.Tensor:

        #
        # similarity matrix
        # -------------------------------------------------------

        # create sim matrix
        cosine_matrix = self.cosine_module.forward(query_embeddings,
                                                   document_embeddings)
        # shape: (batch, 1, query_max, doc_max) for the input of conv_2d
        cosine_matrix = cosine_matrix[:, None, :, :]

        #
        # generate query and doc contexts
        # -------------------------------------------------------

        query_context = torch.mean(query_embeddings, dim=1)
        document_context = self.doc_context_pool(
            document_embeddings.transpose(1, 2)).transpose(1, 2)

        cosine_matrix_context = self.cosine_module.forward(
            query_context.unsqueeze(dim=1), document_context).squeeze(1)

        #
        # duplicate cosine_matrix -> n-gram convolutions, then top-k pooling
        # ----------------------------------------------
        conv_results = []

        #
        # 1x1 cosine matrix (extra without convolutions)
        #

        cr_kmax_result = [[], []]

        for view_size in self.kmax_pooling_views:
            val, idx = torch.topk(cosine_matrix.squeeze(dim=1)[:, :,
                                                               0:view_size],
                                  k=self.kmax_pooling_size,
                                  sorted=True)
            cr_kmax_result[0].append(val)
            cr_kmax_result[1].append(idx)

        cr_kmax_result[0] = torch.cat(cr_kmax_result[0], dim=-1)
        cr_kmax_result[1] = torch.cat(cr_kmax_result[1], dim=-1)

        # incorporate context sims here, by selecting them from the kmax of the non-context sims
        flat_context = cosine_matrix_context.view(-1)
        index_offset = cr_kmax_result[1] + torch.arange(
            0,
            cr_kmax_result[1].shape[0] * cosine_matrix_context.shape[1],
            cosine_matrix_context.shape[1],
            device=cr_kmax_result[1].device).unsqueeze(-1).unsqueeze(-1)
        selected_context = flat_context.index_select(
            dim=0,
            index=index_offset.view(-1)).view(cr_kmax_result[1].shape[0],
                                              cr_kmax_result[1].shape[1], -1)
        conv_results.append(
            torch.cat([cr_kmax_result[0], selected_context], dim=2))

        #
        # nxn n-gram cosine matrices
        #
        for conv in self.convolutions:
            cr = conv(cosine_matrix)

            #
            # (2) take the kmax at multiple views of the cosine matrix - always starting at the beginning of the document
            #

            cr_kmax_result = [[], []]
            for view_size in self.kmax_pooling_views:
                val, idx = torch.topk(cr.squeeze(dim=1)[:, :, 0:view_size],
                                      k=self.kmax_pooling_size,
                                      sorted=True)
                cr_kmax_result[0].append(val)
                cr_kmax_result[1].append(idx)
            cr_kmax_result[0] = torch.cat(cr_kmax_result[0], dim=-1)
            cr_kmax_result[1] = torch.cat(cr_kmax_result[1], dim=-1)

            #
            # (1) incorporate context sims here, by selecting them from the kmax of the non-context sims
            #
            flat_context = cosine_matrix_context.view(-1)
            index_offset = cr_kmax_result[1] + torch.arange(
                0,
                cr_kmax_result[1].shape[0] * cosine_matrix_context.shape[1],
                cosine_matrix_context.shape[1],
                device=cr_kmax_result[1].device).unsqueeze(-1).unsqueeze(-1)
            selected_context = flat_context.index_select(
                dim=0,
                index=index_offset.view(-1)).view(cr_kmax_result[1].shape[0],
                                                  cr_kmax_result[1].shape[1],
                                                  -1)
            conv_results.append(
                torch.cat([cr_kmax_result[0], selected_context], dim=2))

        #
        # flatten all paths together & weight by query idf
        # -------------------------------------------------------

        per_query_results = torch.cat(conv_results, dim=-1)

        weighted_per_query = per_query_results * self.masked_softmax(
            query_idfs, query_pad_oov_mask.unsqueeze(-1))

        #
        # (3) shuffle component
        #
        if self.training:
            weighted_per_query = weighted_per_query[
                :, torch.randperm(weighted_per_query.shape[1]), :]

        all_flat = weighted_per_query.view(weighted_per_query.shape[0], -1)

        #
        # dense layer
        # -------------------------------------------------------

        dense_out = F.relu(self.dense(all_flat))
        dense_out = F.relu(self.dense2(dense_out))
        dense_out = self.dense3(dense_out)

        output = torch.squeeze(dense_out, 1)
        if output_secondary_output:
            return output, {}
        return output

    def get_param_stats(self):
        return "CO-PACRR: / "
Example #18
class PACRR(nn.Module):
    '''
    Paper: PACRR: A Position-Aware Neural IR Model for Relevance Matching, Hui et al., EMNLP'17

    Reference code (but in tensorflow):
    
    * first-hand: https://github.com/khui/copacrr/blob/master/models/pacrr.py
    
    '''

    @staticmethod
    def from_config(config,word_embeddings_out_dim):
        return PACRR(unified_query_length=config["pacrr_unified_query_length"], 
                     unified_document_length=config["pacrr_unified_document_length"],
                     max_conv_kernel_size=config["pacrr_max_conv_kernel_size"],
                     conv_output_size=config["pacrr_conv_output_size"],
                     kmax_pooling_size=config["pacrr_kmax_pooling_size"])

    def __init__(self,

                 unified_query_length:int,
                 unified_document_length:int,

                 max_conv_kernel_size: int, # 2 to n
                 conv_output_size: int, # conv output channels

                 kmax_pooling_size: int): # per query k-max pooling
                 
        super(PACRR,self).__init__()

        self.cosine_module = CosineMatrixAttention()

        self.unified_query_length = unified_query_length
        self.unified_document_length = unified_document_length

        self.convolutions = []
        for i in range(2, max_conv_kernel_size + 1):
            self.convolutions.append(
                nn.Sequential(
                    nn.ConstantPad2d((0,i - 1,0, i - 1), 0), # this outputs [batch,1,unified_query_length + i - 1,unified_document_length + i - 1]
                    nn.Conv2d(kernel_size=i, in_channels=1, out_channels=conv_output_size), # this outputs [batch,conv_output_size,unified_query_length,unified_document_length]
                    nn.MaxPool3d(kernel_size=(conv_output_size,1,1)) # this outputs [batch,1,unified_query_length,unified_document_length]
            ))
        self.convolutions = nn.ModuleList(self.convolutions) # register conv as part of the model

        self.masked_softmax = MaskedSoftmax()
        self.kmax_pooling_size = kmax_pooling_size

        self.dense = nn.Linear(kmax_pooling_size * unified_query_length * max_conv_kernel_size, out_features=100, bias=True)
        self.dense2 = nn.Linear(100, out_features=10, bias=True)
        self.dense3 = nn.Linear(10, out_features=1, bias=False)

    def forward(self, query_embeddings: torch.Tensor, document_embeddings: torch.Tensor,
                query_pad_oov_mask: torch.Tensor, document_pad_oov_mask: torch.Tensor,
                query_idfs: torch.Tensor, document_idfs: torch.Tensor, 
                output_secondary_output: bool = False) -> torch.Tensor:

        #
        # similarity matrix
        # -------------------------------------------------------

        # create sim matrix
        cosine_matrix = self.cosine_module.forward(query_embeddings, document_embeddings)
        # shape: (batch, 1, query_max, doc_max) for the input of conv_2d
        cosine_matrix = cosine_matrix[:,None,:,:]

        #
        # duplicate cosine_matrix -> n-gram convolutions, then top-k pooling
        # ----------------------------------------------
        conv_results = []
        conv_results.append(torch.topk(cosine_matrix.squeeze(),k=self.kmax_pooling_size,sorted=True)[0])

        for conv in self.convolutions:
            cr = conv(cosine_matrix)
            cr_kmax_result = torch.topk(cr.squeeze(),k=self.kmax_pooling_size,sorted=True)[0]
            conv_results.append(cr_kmax_result)

        #
        # flatten all paths together & weight by query idf
        # -------------------------------------------------------
        
        per_query_results = torch.cat(conv_results,dim=-1)

        weighted_per_query = per_query_results * self.masked_softmax(query_idfs, query_pad_oov_mask.unsqueeze(-1))

        all_flat = weighted_per_query.view(weighted_per_query.shape[0],-1)


        #
        # dense layer
        # -------------------------------------------------------

        dense_out = F.relu(self.dense(all_flat))
        dense_out = F.relu(self.dense2(dense_out))
        dense_out = self.dense3(dense_out)

        output = torch.squeeze(dense_out, 1)
        return output

    def get_param_stats(self):
        return "PACRR: / "
Example #19
class MatchPyramid(nn.Module):
    '''
    Paper: Text Matching as Image Recognition, Pang et al., AAAI'16
    '''
    def __init__(
        self,
        #The embedding layer is specified as an AllenNLP TextFieldEmbedder.
        word_embeddings: TextFieldEmbedder,
        #the number of output channels of each convolution layer
        conv_output_size: List[int],
        #the kernel size (height, width) of each convolution layer
        conv_kernel_size: List[Tuple[int, int]],
        # the size of pooling layers to reduce the dimension of the feature maps
        adaptive_pooling_size: List[Tuple[int, int]]):

        super(MatchPyramid, self).__init__()

        self.word_embeddings = word_embeddings
        self.cosine_module = CosineMatrixAttention()

        if len(conv_output_size) != len(conv_kernel_size) or len(
                conv_output_size) != len(adaptive_pooling_size):
            raise Exception(
                "conv_output_size, conv_kernel_size, adaptive_pooling_size must have the same length"
            )

        #define the dictionary of convolution layers
        conv_layer_dict = OrderedDict()
        last_channel_out = 1
        for i in range(len(conv_output_size)):
            #pads the input tensor boundaries with a constant value
            #padding tuple: (padding_left, padding_right, padding_top, padding_bottom)
            conv_layer_dict["pad " + str(i)] = nn.ConstantPad2d(
                (0, conv_kernel_size[i][0] - 1, 0, conv_kernel_size[i][1] - 1),
                0)
            #applies a 2D convolution
            conv_layer_dict["conv " + str(i)] = nn.Conv2d(
                kernel_size=conv_kernel_size[i],
                in_channels=last_channel_out,
                out_channels=conv_output_size[i])
            #applies a ReLU activation function
            conv_layer_dict["relu " + str(i)] = nn.ReLU()
            #applies a 2D adaptive max pooling
            conv_layer_dict["pool " + str(i)] = nn.AdaptiveMaxPool2d(
                adaptive_pooling_size[i])

            last_channel_out = conv_output_size[i]

        #add the layers to the model
        self.conv_layers = nn.Sequential(conv_layer_dict)

        ##adding FC layers
        self.dense = nn.Linear(conv_output_size[-1] *
                               adaptive_pooling_size[-1][0] *
                               adaptive_pooling_size[-1][1],
                               out_features=100,
                               bias=True)
        self.dense2 = nn.Linear(100, out_features=10, bias=True)
        self.dense3 = nn.Linear(10, out_features=1, bias=False)

        #initialize weights (values are taken from matchzoo)
        torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014)

        #initialize biases
        self.dense.bias.data.fill_(0.0)

    def forward(self, query: Dict[str, torch.Tensor],
                document: Dict[str, torch.Tensor]) -> torch.Tensor:

        #
        # prepare embedding tensors
        # -------------------------------------------------------

        # shape: (batch, query_max)
        query_pad_oov_mask = (query["tokens"] > 0).float()
        # shape: (batch, doc_max)
        document_pad_oov_mask = (document["tokens"] > 0).float()

        # shape: (batch, query_max,emb_dim)
        query_embeddings = self.word_embeddings(
            query) * query_pad_oov_mask.unsqueeze(-1)
        # shape: (batch, document_max,emb_dim)
        document_embeddings = self.word_embeddings(
            document) * document_pad_oov_mask.unsqueeze(-1)

        #similarity matrix
        #shape: (batch, 1, query_max, doc_max) for the input of conv_2d
        cosine_matrix = self.cosine_module.forward(query_embeddings,
                                                   document_embeddings)
        cosine_matrix = cosine_matrix[:, None, :, :]

        #convolution
        #shape: (batch, conv_output_size, query_max, doc_max)
        conv_result = self.conv_layers(cosine_matrix)

        #dynamic pooling
        #flatten the output of dynamic pooling
        #shape: (batch, conv_output_size * pool_h * pool_w)
        conv_result_flat = conv_result.view(conv_result.size(0), -1)

        #
        # Learning to rank layer
        # -------------------------------------------------------
        dense_out = F.relu(self.dense(conv_result_flat))
        dense_out = F.relu(self.dense2(dense_out))
        dense_out = self.dense3(dense_out)
        output = torch.squeeze(dense_out, 1)
        return output
Example #20
class MatchPyramid(nn.Module):
    '''
    Paper: Text Matching as Image Recognition, Pang et al., AAAI'16

    Reference code (but in tensorflow):
    
    * first-hand: https://github.com/pl8787/MatchPyramid-TensorFlow/blob/master/model/model_mp.py
    
    * somewhat-third-hand reference: https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/matchpyramid.py

    '''
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 conv_output_size: List[int],
                 conv_kernel_size: List[Tuple[int, int]],
                 adaptive_pooling_size: List[Tuple[int, int]]):

        super(MatchPyramid, self).__init__()

        self.word_embeddings = word_embeddings
        self.cosine_module = CosineMatrixAttention()
        #self.cosine_module = DotProductMatrixAttention()

        if len(conv_output_size) != len(conv_kernel_size) or len(
                conv_output_size) != len(adaptive_pooling_size):
            raise Exception(
                "conv_output_size, conv_kernel_size, adaptive_pooling_size must have the same length"
            )

        conv_layer_dict = OrderedDict()
        last_channel_out = 1
        for i in range(len(conv_output_size)):
            conv_layer_dict["pad " + str(i)] = nn.ConstantPad2d(
                (0, conv_kernel_size[i][0] - 1, 0, conv_kernel_size[i][1] - 1),
                0)
            conv_layer_dict["conv " + str(i)] = nn.Conv2d(
                kernel_size=conv_kernel_size[i],
                in_channels=last_channel_out,
                out_channels=conv_output_size[i])
            conv_layer_dict["relu " + str(i)] = nn.ReLU()
            conv_layer_dict["pool " + str(i)] = nn.AdaptiveMaxPool2d(
                adaptive_pooling_size[i]
            )  # this is unusual - but this is how it is written in the paper
            # (one would expect pooling only at the end?)
            last_channel_out = conv_output_size[i]

        self.conv_layers = nn.Sequential(conv_layer_dict)

        #self.dropout = nn.Dropout(0)

        self.dense = nn.Linear(conv_output_size[-1] *
                               adaptive_pooling_size[-1][0] *
                               adaptive_pooling_size[-1][1],
                               out_features=100,
                               bias=True)
        self.dense2 = nn.Linear(100, out_features=10, bias=True)
        self.dense3 = nn.Linear(10, out_features=1, bias=False)

        # init with small weights, otherwise the dense output is way too high for the tanh -> resulting in loss == 1 all the time
        #torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014)  # inits taken from matchzoo
        #self.dense.bias.data.fill_(0.0)

    def forward(self, query: Dict[str, torch.Tensor],
                document: Dict[str, torch.Tensor], query_length: torch.Tensor,
                document_length: torch.Tensor) -> torch.Tensor:
        # pylint: disable=arguments-differ

        #
        # prepare embedding tensors
        # -------------------------------------------------------

        # we assume 1 is the unknown token, 0 is padding - both need to be removed
        if len(query["tokens"].shape) == 2:  # (embedding lookup matrix)
            # shape: (batch, query_max)
            query_pad_oov_mask = (query["tokens"] > 1).float()
            # shape: (batch, doc_max)
            document_pad_oov_mask = (document["tokens"] > 1).float()
        else:  # == 3 (elmo characters per word)
            # shape: (batch, query_max)
            query_pad_oov_mask = (torch.sum(query["tokens"], 2) > 0).float()
            # shape: (batch, doc_max)
            document_pad_oov_mask = (torch.sum(document["tokens"], 2) >
                                     0).float()

        # shape: (batch, query_max,emb_dim)
        query_embeddings = self.word_embeddings(
            query) * query_pad_oov_mask.unsqueeze(-1)
        # shape: (batch, document_max,emb_dim)
        document_embeddings = self.word_embeddings(
            document) * document_pad_oov_mask.unsqueeze(-1)

        #
        # similarity matrix
        # -------------------------------------------------------

        cosine_matrix = self.cosine_module.forward(query_embeddings,
                                                   document_embeddings)
        # shape: (batch, 1, query_max, doc_max) for the input of conv_2d
        cosine_matrix = cosine_matrix[:, None, :, :]

        #
        # convolution
        # -------------------------------------------------------
        # shape: (batch, conv_output_size, query_max, doc_max)

        conv_result = self.conv_layers(cosine_matrix)

        #
        # dynamic pooling
        # -------------------------------------------------------

        # flatten the output of dynamic pooling

        # shape: (batch, conv_output_size * pool_h * pool_w)
        conv_result_flat = conv_result.view(conv_result.size(0), -1)

        #conv_result_flat = self.dropout(conv_result_flat)

        #
        # Learning to rank layer
        # -------------------------------------------------------
        dense_out = F.relu(self.dense(conv_result_flat))
        dense_out = F.relu(self.dense2(dense_out))
        dense_out = self.dense3(dense_out)
        #tanh_out = torch.tanh(dense_out)

        output = torch.squeeze(dense_out, 1)
        return output
Example #21
    def __init__(
        self,
        #The embedding layer is specified as an AllenNLP TextFieldEmbedder.
        word_embeddings: TextFieldEmbedder,
        #the number of output channels of each convolution layer
        conv_output_size: List[int],
        #the kernel size (height, width) of each convolution layer
        conv_kernel_size: List[Tuple[int, int]],
        # the size of pooling layers to reduce the dimension of the feature maps
        adaptive_pooling_size: List[Tuple[int, int]]):

        super(MatchPyramid, self).__init__()

        self.word_embeddings = word_embeddings
        self.cosine_module = CosineMatrixAttention()

        if len(conv_output_size) != len(conv_kernel_size) or len(
                conv_output_size) != len(adaptive_pooling_size):
            raise Exception(
                "conv_output_size, conv_kernel_size, adaptive_pooling_size must have the same length"
            )

        #define the dictionary of convolution layers
        conv_layer_dict = OrderedDict()
        last_channel_out = 1
        for i in range(len(conv_output_size)):
            #pads the input tensor boundaries with a constant value
            #padding tuple: (padding_left, padding_right, padding_top, padding_bottom)
            conv_layer_dict["pad " + str(i)] = nn.ConstantPad2d(
                (0, conv_kernel_size[i][0] - 1, 0, conv_kernel_size[i][1] - 1),
                0)
            #applies a 2D convolution
            conv_layer_dict["conv " + str(i)] = nn.Conv2d(
                kernel_size=conv_kernel_size[i],
                in_channels=last_channel_out,
                out_channels=conv_output_size[i])
            #applies a ReLU activation function
            conv_layer_dict["relu " + str(i)] = nn.ReLU()
            #applies a 2D adaptive max pooling
            conv_layer_dict["pool " + str(i)] = nn.AdaptiveMaxPool2d(
                adaptive_pooling_size[i])

            last_channel_out = conv_output_size[i]

        #add the layers to the model
        self.conv_layers = nn.Sequential(conv_layer_dict)

        ##adding FC layers
        self.dense = nn.Linear(conv_output_size[-1] *
                               adaptive_pooling_size[-1][0] *
                               adaptive_pooling_size[-1][1],
                               out_features=100,
                               bias=True)
        self.dense2 = nn.Linear(100, out_features=10, bias=True)
        self.dense3 = nn.Linear(10, out_features=1, bias=False)

        #initialize weights (values are taken from matchzoo)
        torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014)

        #initialize biases
        self.dense.bias.data.fill_(0.0)
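A minimal instantiation sketch for the constructor above; the sizes are made-up examples, and word_embeddings stands for whatever AllenNLP TextFieldEmbedder the surrounding code already provides:

# hypothetical configuration: two conv blocks, each followed by adaptive max pooling
matchpyramid = MatchPyramid(
    word_embeddings=word_embeddings,             # any AllenNLP TextFieldEmbedder (assumed to exist)
    conv_output_size=[16, 32],                   # output channels per conv layer
    conv_kernel_size=[(3, 3), (3, 3)],           # 2D kernel size per conv layer
    adaptive_pooling_size=[(10, 20), (5, 10)])   # pooled (query, doc) size per layer
# all three lists must have the same length; the final dense layer then
# expects 32 * 5 * 10 = 1600 flattened features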
class TKL_sigir20(nn.Module):
    '''
    TKL (Transformer-Kernel for Long documents) is a neural IR model for long documents

    Paper: Local Self-Attention over Long Text for Efficient Document Retrieval, Hofstätter et al., SIGIR'20
    '''

    @staticmethod
    def from_config(config,word_embeddings_out_dim):
        return TKL_sigir20(word_embeddings_out_dim, 
                     kernels_mu =    config["tk_kernels_mu"],
                     kernels_sigma = config["tk_kernels_sigma"],
                     att_heads =     config["tk_att_heads"],
                     att_layer =     config["tk_att_layer"],
                     att_proj_dim =  config["tk_att_proj_dim"],
                     att_ff_dim =    config["tk_att_ff_dim"],
                     max_length =    config["max_doc_length"],
                     use_pos_encoding     = config["tk_use_pos_encoding"],
                     use_diff_posencoding = config["tk_use_diff_posencoding"],
                     saturation_type= config["tk_saturation_type"],
                     )

    def __init__(self,
                 _embsize:int,
                 kernels_mu: List[float],
                 kernels_sigma: List[float],
                 att_heads: int,
                 att_layer: int,
                 att_proj_dim: int,
                 att_ff_dim: int,
                 max_length,
                 use_pos_encoding,  
                 use_diff_posencoding,
                 saturation_type,
                 ):

        super(TKL_sigir20, self).__init__()

        n_kernels = len(kernels_mu)
        self.use_pos_encoding     = use_pos_encoding    
        self.use_diff_posencoding = use_diff_posencoding

        self.re_use_encoding = True

        self.chunk_size = 40
        self.overlap = 5
        self.extended_chunk_size = self.chunk_size + 2 * self.overlap
        
        self.sliding_window_size = 30
        self.top_k_chunks = 3

        self.use_idf_sat = saturation_type == "idf"
        self.use_embedding_sat = saturation_type == "embedding"
        self.use_linear_sat = saturation_type == "linear"
        self.use_log_sat = saturation_type == "log"

        if len(kernels_mu) != len(kernels_sigma):
            raise Exception("len(kernels_mu) != len(kernels_sigma)")

        # static - kernel size & magnitude variables
        self.mu = nn.Parameter(torch.cuda.FloatTensor(kernels_mu), requires_grad=False)#.view(1, 1, 1, n_kernels)
        self.sigma = nn.Parameter(torch.cuda.FloatTensor(kernels_sigma), requires_grad=False)#.view(1, 1, 1, n_kernels)
        #self.mu.data.requires_grad=True
        #self.sigma.data.requires_grad=True

        pos_f = self.get_positional_features(_embsize, 30) #max_timescale=100000
        pos_f.requires_grad = True
        self.positional_features_q = nn.Parameter(pos_f)
        self.positional_features_q.requires_grad = True

        if self.use_diff_posencoding == True:
            pos_f = self.get_positional_features(_embsize,2000+500+self.extended_chunk_size)[:,500:,:].clone() #max_timescale=100000
            pos_f.requires_grad = True
            self.positional_features_d = nn.Parameter(pos_f)
            self.positional_features_d.requires_grad = True
        else:
            self.positional_features_d = self.positional_features_q


        self.mixer = nn.Parameter(torch.full([1], 0.5, dtype=torch.float32, requires_grad=True))
        self.mixer_sat = nn.Parameter(torch.full([1], 0.5, dtype=torch.float32, requires_grad=True))

        #self.emb_reducer = nn.Linear(_embsize, 300, bias=True)

        encoder_layer = nn.TransformerEncoderLayer(_embsize, att_heads, dim_feedforward=att_ff_dim, dropout=0)
        self.contextualizer = nn.TransformerEncoder(encoder_layer, att_layer, norm=None)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights) 
        self.cosine_module = CosineMatrixAttention()

        self.saturation_linear = nn.Linear(2, 1, bias=True)
        torch.nn.init.constant_(self.saturation_linear.bias, 100)
        torch.nn.init.uniform_(self.saturation_linear.weight, -0.014, 0.014)

        self.saturation_linear2 = nn.Linear(2, 1, bias=True)
        torch.nn.init.constant_(self.saturation_linear2.bias, 100)
        torch.nn.init.uniform_(self.saturation_linear2.weight, -0.014, 0.014)

        self.saturation_linear3 = nn.Linear(2, 1, bias=True)
        torch.nn.init.constant_(self.saturation_linear3.bias, 100)
        torch.nn.init.uniform_(self.saturation_linear3.weight, -0.014, 0.014)
        

        self.sat_normer = nn.LayerNorm(2,elementwise_affine=True)
        #self.sat_emb_reduce1 = nn.Linear(_embsize,_embsize, bias=False)
        self.sat_emb_reduce1 = nn.Linear(_embsize, 1, bias=False)
        #torch.nn.init.constant_(self.sat_emb_reduce1.bias, 2)

        self.kernel_mult = nn.Parameter(torch.full([4,1,1,1,n_kernels], 1, dtype=torch.float32, requires_grad=True))
        #self.length_normer = nn.Parameter(torch.full([1,1,1,1], 30, dtype=torch.float32, requires_grad=True))


        #self.max_chunks = int(max_length / self.chunk_size + 1)

        self.chunk_scoring = nn.Parameter(torch.full([1,self.top_k_chunks*5], 1, dtype=torch.float32, requires_grad=True))
        self.mixer_end = nn.Parameter(torch.full([1], 0.5, dtype=torch.float32, requires_grad=True))

        self.dense = nn.Linear(n_kernels, 1, bias=False)
        torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014) # inits taken from matchzoo

    def forward(self, query_embeddings: torch.Tensor, document_embeddings: torch.Tensor,
                query_pad_oov_mask: torch.Tensor, document_pad_oov_mask: torch.Tensor,
                query_idfs: torch.Tensor, document_idfs: torch.Tensor, 
                output_secondary_output: bool = False) -> torch.Tensor:
        # pylint: disable=arguments-differ

        #
        # contextualization
        # -------------------------------------------------------

        query_embeddings_original = query_embeddings
        query_embeddings, query_embeddings_tf_output = self.forward_representation(query_embeddings, query_pad_oov_mask, self.positional_features_q[:,:query_embeddings.shape[1],:])


        if document_pad_oov_mask.shape[1] > self.overlap:
            needed_padding = self.extended_chunk_size - ((document_pad_oov_mask.shape[1] - self.overlap) % self.chunk_size)
        else:
            needed_padding = self.extended_chunk_size - self.overlap - document_pad_oov_mask.shape[1]

        document_embeddings = nn.functional.pad(document_embeddings,(0,0,self.overlap, needed_padding))
        document_pad_oov_mask = nn.functional.pad(document_pad_oov_mask,(self.overlap, needed_padding))

        chunked_docs = document_embeddings.unfold(1,self.extended_chunk_size,self.chunk_size).transpose(-1,-2)
        chunked_pad = document_pad_oov_mask.unfold(1,self.extended_chunk_size,self.chunk_size)
        
        batch_size = chunked_docs.shape[0]
        chunk_pieces = chunked_docs.shape[1]

        chunked_docs2=chunked_docs.reshape(-1,self.extended_chunk_size,document_embeddings.shape[-1])
        chunked_pad2=chunked_pad.reshape(-1,self.extended_chunk_size)

        packed_indices = chunked_pad2[:,self.overlap:-self.overlap].sum(-1) != 0

        documents_packed = chunked_docs2[packed_indices]
        padding_packed = chunked_pad2[packed_indices]

        if self.re_use_encoding:
            document_pos_encoding = self.positional_features_d[:,:documents_packed.shape[1],:]
        else:
            document_pos_encoding = self.positional_features_d[:,:document_embeddings.shape[1],:]
            document_pos_encoding = document_pos_encoding.unfold(1,self.extended_chunk_size,self.chunk_size).transpose(-1,-2)
            document_pos_encoding = document_pos_encoding.squeeze(0)
            document_pos_encoding = document_pos_encoding.repeat(document_embeddings.shape[0],1,1)[packed_indices]

        documents_packed,_ = self.forward_representation(documents_packed, padding_packed, document_pos_encoding)

        documents_unique_again = documents_packed[:,self.overlap:-self.overlap,:]
        document_mask_packed_unique = padding_packed[:,self.overlap:-self.overlap]

        #
        # cosine matrix
        # -------------------------------------------------------
        packed_query_embeddings = query_embeddings.unsqueeze(1).expand(-1,chunk_pieces,-1,-1).reshape(-1,query_embeddings.shape[1],query_embeddings.shape[-1])[packed_indices]
        packed_query_mask = query_pad_oov_mask.unsqueeze(1).expand(-1,chunk_pieces,-1).reshape(-1,query_embeddings.shape[1])[packed_indices]

        # shape: (batch, query_max, doc_max)
        cosine_matrix = self.cosine_module.forward(packed_query_embeddings, documents_unique_again)

        #
        # gaussian kernels & soft-TF
        #
        # first run through kernel, then sum on doc dim then sum on query dim
        # -------------------------------------------------------

        cosine_matrix_extradim = cosine_matrix.unsqueeze(-1)        
        raw_kernel_results = torch.exp(- torch.pow(cosine_matrix_extradim - self.mu.view(1, 1, 1, -1), 2) / (2 * torch.pow(self.sigma.view(1, 1, 1, -1), 2)))
        kernel_results_masked = raw_kernel_results * document_mask_packed_unique.unsqueeze(1).unsqueeze(-1)

        kernel_activations_per_doc = torch.zeros((chunked_docs2.shape[0],query_embeddings.shape[1],documents_unique_again.shape[1],kernel_results_masked.shape[-1]), dtype=chunked_docs2.dtype, layout=chunked_docs2.layout, device=chunked_docs2.device)
        kernel_activations_per_doc[packed_indices] = kernel_results_masked

        kernel_activations_per_doc = kernel_activations_per_doc.transpose(1,2).reshape(batch_size,-1,query_embeddings.shape[1],kernel_results_masked.shape[-1]).transpose(2,1)


        #
        # kernel-pooling
        # -------------------------------------------------------

        if kernel_activations_per_doc.shape[2] < self.sliding_window_size:
            kernel_activations_per_doc = nn.functional.pad(kernel_activations_per_doc,(0,0,0, self.sliding_window_size - kernel_activations_per_doc.shape[2]))

        unrolled_kernel_activations = kernel_activations_per_doc.unfold(2,self.sliding_window_size,2).transpose(-1,-2)
        unrolled_kernel_activation_lengths = torch.sum(unrolled_kernel_activations.sum(dim=-1) != 0,dim=-1)
        per_kernel_query = torch.sum(unrolled_kernel_activations, -2) 


        if self.use_idf_sat:
            sat_influencer = torch.cat([torch.relu(query_idfs.expand_as(unrolled_kernel_activation_lengths).unsqueeze(-1)),
                                        unrolled_kernel_activation_lengths.float().unsqueeze(-1)],dim=-1)

            sat1 = self.saturation_linear(sat_influencer)
            sat2 = 1 / self.saturation_linear2(sat_influencer)
            sat3 = self.saturation_linear3(sat_influencer)

            sat_per_kernel_query = sat1 * (torch.clamp(per_kernel_query, min=1e-10) ** sat2) - sat3

        elif self.use_embedding_sat:
            sat_influencer = torch.cat([self.sat_emb_reduce1(query_embeddings).expand_as(unrolled_kernel_activation_lengths).unsqueeze(-1),
                                        unrolled_kernel_activation_lengths.float().unsqueeze(-1)],dim=-1)

            sat_influencer = self.sat_normer(sat_influencer)

            sat1 = self.saturation_linear(sat_influencer)
            sat2 = 1 / self.saturation_linear2(sat_influencer)
            sat3 = self.saturation_linear3(sat_influencer)

            sat_per_kernel_query = sat1 * (torch.clamp(per_kernel_query, min=1e-10) ** sat2) - sat3

        elif self.use_linear_sat:
            sat_influencer = torch.cat([torch.relu(query_idfs.expand_as(unrolled_kernel_activation_lengths).unsqueeze(-1)),
                                        unrolled_kernel_activation_lengths.float().unsqueeze(-1)],dim=-1)

            sat1 = self.saturation_linear(sat_influencer)
            sat2 = self.saturation_linear2(sat_influencer)

            sat_per_kernel_query = sat1 * torch.clamp(per_kernel_query, min=1e-10) + sat2

        elif self.use_log_sat:
            sat_per_kernel_query = torch.log(torch.clamp(per_kernel_query * self.kernel_mult[0], min=1e-10))

        sat_per_kernel_query = sat_per_kernel_query * query_pad_oov_mask.unsqueeze(-1).unsqueeze(-1) * (unrolled_kernel_activation_lengths > 0).float().unsqueeze(-1) # make sure we mask out padding values
        per_kernel = torch.sum(sat_per_kernel_query, 1) 

        dense_out = self.dense(per_kernel)
        score = dense_out.squeeze(-1)

        if score.shape[1] < self.top_k_chunks:
            score = nn.functional.pad(score,(0, self.top_k_chunks - score.shape[1]))

        score[score == 0] = -9900
        orig_score = score

        #
        # argmax top-n hills
        # 
        top_non_overlapping_idx = torch.zeros((orig_score.shape[0],self.top_k_chunks), dtype=torch.long, device=orig_score.device) 
        max_per_region_score = orig_score.clone()

        r = torch.arange(max_per_region_score.shape[1],device=max_per_region_score.device)

        for c in range(0,self.top_k_chunks):
           
            best_index = torch.argmax(max_per_region_score,dim=1)
            top_non_overlapping_idx[:,c] = best_index
            region_pool = torch.abs(r - best_index.unsqueeze(-1)) < self.sliding_window_size / 2
            max_per_region_score[region_pool] = -10001 - c

       
        top_non_overlapping_idx_neighbors = torch.cat([top_non_overlapping_idx,top_non_overlapping_idx - 1,top_non_overlapping_idx + 1,top_non_overlapping_idx - 2,top_non_overlapping_idx + 2],dim=1)
        top_non_overlapping_idx_neighbors[top_non_overlapping_idx_neighbors < 0] = 0
        top_non_overlapping_idx_neighbors[top_non_overlapping_idx_neighbors >= orig_score.shape[1]] = orig_score.shape[1] - 1

        topk_indices_flat = (top_non_overlapping_idx_neighbors + torch.arange(0,orig_score.shape[0]*orig_score.shape[1],orig_score.shape[1],device=orig_score.device).unsqueeze(-1)).view(-1)
        top_k_non_overlapping = orig_score.view(-1).index_select(0,topk_indices_flat).view(top_non_overlapping_idx.shape[0],-1)
        top_k_non_overlapping[top_k_non_overlapping <= -9900] = 0

        orig_score[orig_score <= -9900] = 0

        score = (top_k_non_overlapping * self.chunk_scoring).sum(dim=1)

        if output_secondary_output:
            query_mean_vector = query_embeddings.sum(dim=1) / query_pad_oov_mask.sum(dim=1).unsqueeze(-1)
            sat_influence_from_top_k = sat_influencer.transpose(1,2).reshape(-1,query_embeddings.shape[1],2).index_select(0,topk_indices_flat).view(top_non_overlapping_idx_neighbors.shape[0],top_non_overlapping_idx_neighbors.shape[1],query_embeddings.shape[1],2)
            return score, {"score":score,"orig_score":orig_score,"top_non_overlapping_idx":top_non_overlapping_idx,"orig_doc_len":document_pad_oov_mask.sum(dim=-1),"top_k_non_overlapping":top_k_non_overlapping,"sat_influence_from_top_k":sat_influence_from_top_k,
                           "total_chunks":chunked_docs2.shape[0],"packed_chunks":documents_packed.shape[0]}
        else:
            return score

    def forward_representation(self, sequence_embeddings: torch.Tensor, sequence_mask: torch.Tensor, positional_features=None) -> torch.Tensor:

        pos_sequence = sequence_embeddings
        if self.use_pos_encoding:
            if positional_features is None:
                positional_features = self.positional_features_d[:,:sequence_embeddings.shape[1],:]
            pos_sequence = sequence_embeddings + positional_features
        
        sequence_embeddings_context = self.contextualizer((pos_sequence).transpose(1,0),src_key_padding_mask=~sequence_mask.bool()).transpose(1,0)
        
        sequence_embeddings = (self.mixer * sequence_embeddings + (1 - self.mixer) * sequence_embeddings_context) * sequence_mask.unsqueeze(-1)

        return sequence_embeddings,sequence_embeddings_context

    def get_positional_features(self,dimensions,
                                max_length,
                                min_timescale: float = 1.0,
                                max_timescale: float = 1.0e4):
        # pylint: disable=line-too-long
        """
        Implements the frequency-based positional encoding described
        in `Attention is all you Need
        <https://www.semanticscholar.org/paper/Attention-Is-All-You-Need-Vaswani-Shazeer/0737da0767d77606169cbf4187b83e1ab62f6077>`_ .

        Generates sinusoids of different frequencies for every position. A sinusoid of a
        different frequency and phase is produced for each dimension of the returned features,
        which allows the attention heads to use absolute and relative positions.

        The number of timescales is equal to dimensions / 2 within the range
        (min_timescale, max_timescale). For each timescale, the two sinusoidal
        signals sin(timestep / timescale) and cos(timestep / timescale) are
        generated and concatenated along the hidden dimension.

        Parameters
        ----------
        dimensions : ``int``
            The hidden dimension of the returned positional features.
        max_length : ``int``
            The number of timesteps to generate features for.
        min_timescale : ``float``, optional (default = 1.0)
            The smallest timescale to use.
        max_timescale : ``float``, optional (default = 1.0e4)
            The largest timescale to use.

        Returns
        -------
        A tensor of shape (1, max_length, dimensions) containing the sinusoidal positional features.
        """
        timesteps=max_length
        hidden_dim = dimensions

        timestep_range = self.get_range_vector(timesteps, 0).data.float()
        # We're generating both cos and sin frequencies,
        # so half for each.
        num_timescales = hidden_dim // 2
        timescale_range = self.get_range_vector(num_timescales, 0).data.float()

        log_timescale_increments = math.log(float(max_timescale) / float(min_timescale)) / float(num_timescales - 1)
        inverse_timescales = min_timescale * torch.exp(timescale_range * -log_timescale_increments)

        # Broadcasted multiplication - shape (timesteps, num_timescales)
        scaled_time = timestep_range.unsqueeze(1) * inverse_timescales.unsqueeze(0)
        # shape (timesteps, 2 * num_timescales)
        sinusoids = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 1)
        if hidden_dim % 2 != 0:
            # if the number of dimensions is odd, the cos and sin
            # timescales had size (hidden_dim - 1) / 2, so we need
            # to add a row of zeros to make up the difference.
            sinusoids = torch.cat([sinusoids, sinusoids.new_zeros(timesteps, 1)], 1)
        return sinusoids.unsqueeze(0)

    def get_range_vector(self, size: int, device: int) -> torch.Tensor:
        """
        Returns a range vector with the desired size, starting at 0. The CUDA implementation
        is meant to avoid copying data from the CPU to the GPU.
        """
        if device > -1:
            return torch.cuda.LongTensor(size, device=device).fill_(1).cumsum(0) - 1
        else:
            return torch.arange(0, size, dtype=torch.long)

    def get_param_stats(self): #" b: "+str(self.dense.bias.data) +\ "b: "+str(self.dense_mean.bias.data) +#"scaler: "+str(self.nn_scaler.data) +\ # " bias: " +str(self.saturation_linear.bias.data) +\
        return "TK: dense w: "+str(self.dense.weight.data) +\
        " self.chunk_scoring: " +str(self.chunk_scoring.data) +\
        " self.kernel_mult: " +str(self.kernel_mult.data) +\
        " self.saturation_linear: " +str(self.saturation_linear.weight.data) + " bias: " +str(self.saturation_linear.bias.data) +\
        " self.saturation_linear2: " +str(self.saturation_linear2.weight.data) + " bias: " +str(self.saturation_linear2.bias.data) +\
        " self.saturation_linear3: " +str(self.saturation_linear3.weight.data) + " bias: " +str(self.saturation_linear3.bias.data) +\
        "mixer: "+str(self.mixer.data) #+ "mixer_end: "+str(self.mixer_end.data)

    def get_param_secondary(self):
        return {"dense_weight":self.dense.weight,
                "saturation_linear_weight":self.saturation_linear.weight,
                "saturation_linear_bias":self.saturation_linear.bias,
                "saturation_linear2_weight":self.saturation_linear2.weight,
                "saturation_linear2_bias":self.saturation_linear2.bias,
                "saturation_linear3_weight":self.saturation_linear3.weight,
                "saturation_linear3_bias":self.saturation_linear3.bias,
                "chunk_scoring":self.chunk_scoring,
                "kernel_mult":self.kernel_mult,
                "mixer":self.mixer}
Ejemplo n.º 24
0
class Conv_KNRM(nn.Module):
    '''
    Paper: Convolutional Neural Networks for Soft-Matching N-Grams in Ad-hoc Search, Dai et al., WSDM'18
    '''

    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 n_grams: int,
                 n_kernels: int,
                 conv_out_dim: int):

        super(Conv_KNRM, self).__init__()

        self.word_embeddings = word_embeddings

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.FloatTensor(self.kernel_mus(n_kernels)), requires_grad = False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.FloatTensor(self.kernel_sigmas(n_kernels)), requires_grad = False).view(1, 1, 1,
                                                                                                          n_kernels)

        # Implement 1 Dimensional CNN layer for each n-gram type
        # Also, use RelU as Activation function
        self.convolutions = []
        for i in range (1, n_grams + 1):
            self.convolutions.append(nn.Sequential(
            nn.ConstantPad1d((0 , i-1 ), 0),
            # the kernel size of the convolutional layer is the same as the current i-gram(uni, bi, tri...) in the loop
            nn.Conv1d(kernel_size = i, in_channels = word_embeddings.get_output_dim(), out_channels = conv_out_dim),
            nn.ReLU()))
            # register conv as part of the model
        self.convolutions = nn.ModuleList(self.convolutions)

        #Cosine similarity matrix
        self.cosine_module = CosineMatrixAttention()


        # Initialize the Linear transformer model:
        # size of the input: number of elements in the soft-TF feautes * number of kernel products (
        # n_kernels *  n_grams * n_grams = all combination of match matrix creation
        # (n-gram pairs from query and document embeddings)
        # the output will be 1 sample
        # also use bias based on the paper formula (by default it's true but just to make sure)
        self.transform = nn.Linear(in_features = n_kernels * n_grams * n_grams, out_features = 1, bias = True)

    def forward(self, query: Dict[str, torch.Tensor], document: Dict[str, torch.Tensor]) -> torch.Tensor:

        #
        # prepare embedding tensors & padding masks
        # -------------------------------------------------------

        # we assume token id 0 is padding - it has to be masked out for both query and document
        # shape: (batch, query_max)
        #
        query_pad_mask = (query["tokens"] > 0).float()  # use > 1 to also mask oov terms
        document_pad_mask = (document["tokens"] > 0).float()

        maskedEmbed = getMaskedEmbed(query_pad_mask, document_pad_mask)
        maskedEmbed = (maskedEmbed.unsqueeze(-1)).cuda()

        #Before the conv
        queryEmbeddings = (self.word_embeddings(query)).cuda()

        documentEmbeddings = (self.word_embeddings(document)).cuda()

        # Transpose the embeddings to make them applicable to the convolution layer (channels first);
        # after the conv + ReLU they are transposed back

        query_embeddings_t = queryEmbeddings.transpose(1, 2)
        document_embeddings_t = documentEmbeddings.transpose(1, 2)


        #Initialize lists to store the convolved n-gram query and document embeddings
        convQueries = []
        convDocs = []

        #Loop through all n-gram convolution types
        for conv in self.convolutions:
            # run the embeddings through the conv layer and store them in the list, transposed back to (batch, seq_len, conv_out_dim)
            convQueries.append(conv(query_embeddings_t).transpose(1, 2))
            convDocs.append(conv(document_embeddings_t).transpose(1, 2))


        #Place sigma and mu into the gpu
        mu = self.mu
        mu = mu.cuda()

        sigma = self.sigma
        sigma = sigma.cuda()

        #Now we have the convolved n-gram embeddings for documents and queries
        # Next step:
        # For each n-gram combination, create a match matrix by combining the n-gram document and query embeddings:
        # this yields n*n match matrices
        #Concept: loop through each convolved document embedding, calculate the cosine similarity with each query embedding,
        # apply kernel pooling on the result (where the padding is masked out),
        # then store the results in the softTFFeatures list
        softTFFeatures = []
        #Outer loop over the convolved document embeddings
        for d in convDocs:
            #inner loop over the convolved query embeddings
            for q in convQueries:
                # Calculate cosine similarity
                matchMatrix = self.cosine_module.forward(q, d)


                #Add a new dimension so the kernels can broadcast over it
                matchMatrix = matchMatrix.unsqueeze(-1).cuda()

                # Calculate kernel pooling on the match matrix; inputs: match matrix, mask matrix, query mask, mu and sigma
                kernelResult = calculateKernel(matchMatrix, maskedEmbed, query_pad_mask, mu = mu, sigma = sigma)
                # the result is the soft-TF feature vector for this d-gram document / q-gram query combination
                #Store the features in the list
                softTFFeatures.append(kernelResult)


        # Concatenate the kernel pooling results / soft-TF features: each list entry holds the
        # soft-TF features of one n-gram pair, and torch.cat joins them along the feature dimension,
        # giving one feature vector per query-document pair
        pooling_sum = torch.cat(softTFFeatures, 1).cuda()

        # Then a linear transformation is applied to the concatenated features
        # The learning-to-rank (LeToR) layer combines the soft-TF ranking features into a ranking score:
        # Steps:
        # apply the linear transformation to the concatenated features,
        # calculate the hyperbolic tangent on it,
        # and squeeze the 2nd tensor dimension (of size 1) to get the final score
        output = torch.squeeze(torch.tanh(self.transform(pooling_sum)), 1).cuda()
        return output

    def kernel_mus(self, n_kernels: int):
        """
        get the mu for each Gaussian kernel. Mu is the middle of each bin
        :param n_kernels: number of kernels (including exact match). first one is exact match
        :return: l_mu, a list of mu.
        """
        l_mu = [1.0]
        if n_kernels == 1:
            return l_mu

        bin_size = 2.0 / (n_kernels - 1)  # score range from [-1, 1]
        l_mu.append(1 - bin_size / 2)  # mu: middle of the bin
        for i in range(1, n_kernels - 1):
            l_mu.append(l_mu[i] - bin_size)
        return l_mu

    def kernel_sigmas(self, n_kernels: int):
        """
        get the sigma for each Gaussian kernel.
        :param n_kernels: number of kernels (including exact match)
        :return: l_sigma, a list of sigma
        """
        bin_size = 2.0 / (n_kernels - 1)
        l_sigma = [0.001]  # for exact match. small variance -> exact match
        if n_kernels == 1:
            return l_sigma

        l_sigma += [0.5 * bin_size] * (n_kernels - 1)
        return l_sigma
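As a quick sanity check of the two helpers above, these are the values they produce for n_kernels = 5 (one exact-match kernel plus four soft-match bins over the cosine range [-1, 1]):

# n_kernels = 5 -> bin_size = 2.0 / (5 - 1) = 0.5
# kernel_mus(5)    == [1.0, 0.75, 0.25, -0.25, -0.75]
# kernel_sigmas(5) == [0.001, 0.25, 0.25, 0.25, 0.25]   # tiny sigma for the exact-match kernel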
Ejemplo n.º 25
0
class TK_v1(nn.Module):
    '''
    TK is a neural IR model - a fusion between transformer contextualization & kernel-based scoring

    -> uses 1 transformer block to contextualize embeddings
    -> soft-histogram kernels to score interactions

    '''
    @staticmethod
    def from_config(config, word_embeddings_out_dim):
        return TK_v1(word_embeddings_out_dim,
                     kernels_mu=config["tk_kernels_mu"],
                     kernels_sigma=config["tk_kernels_sigma"],
                     att_heads=config["tk_att_heads"],
                     att_layer=config["tk_att_layer"],
                     att_proj_dim=config["tk_att_proj_dim"],
                     att_ff_dim=config["tk_att_ff_dim"])

    def __init__(self, _embsize: int, kernels_mu: List[float],
                 kernels_sigma: List[float], att_heads: int, att_layer: int,
                 att_proj_dim: int, att_ff_dim: int):

        super(TK_v1, self).__init__()

        n_kernels = len(kernels_mu)

        if len(kernels_mu) != len(kernels_sigma):
            raise Exception("len(kernels_mu) != len(kernels_sigma)")

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.cuda.FloatTensor(kernels_mu),
                           requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.cuda.FloatTensor(kernels_sigma),
                              requires_grad=False).view(1, 1, 1, n_kernels)
        self.nn_scaler = nn.Parameter(
            torch.full([1], 0.01, dtype=torch.float32, requires_grad=True))
        self.mixer = nn.Parameter(
            torch.full([1, 1, 1], 0.5, dtype=torch.float32,
                       requires_grad=True))

        self.stacked_att = StackedSelfAttentionEncoder(
            input_dim=_embsize,
            hidden_dim=_embsize,
            projection_dim=att_proj_dim,
            feedforward_hidden_dim=att_ff_dim,
            num_layers=att_layer,
            num_attention_heads=att_heads,
            dropout_prob=0,
            residual_dropout_prob=0,
            attention_dropout_prob=0)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
        self.cosine_module = CosineMatrixAttention()

        # bias is set to True in original code (we found it to not help, how could it?)
        self.dense = nn.Linear(n_kernels, 1, bias=False)
        self.dense_mean = nn.Linear(n_kernels, 1, bias=False)
        self.dense_comb = nn.Linear(2, 1, bias=False)

        # init with small weights, otherwise the dense output is way too high for the tanh -> resulting in loss == 1 all the time
        torch.nn.init.uniform_(self.dense.weight, -0.014,
                               0.014)  # inits taken from matchzoo
        torch.nn.init.uniform_(self.dense_mean.weight, -0.014,
                               0.014)  # inits taken from matchzoo

        #self.dense.bias.data.fill_(0.0)

    def forward(self,
                query_embeddings: torch.Tensor,
                document_embeddings: torch.Tensor,
                query_pad_oov_mask: torch.Tensor,
                document_pad_oov_mask: torch.Tensor,
                output_secondary_output: bool = False) -> torch.Tensor:
        # pylint: disable=arguments-differ

        query_embeddings = query_embeddings * query_pad_oov_mask.unsqueeze(-1)
        document_embeddings = document_embeddings * document_pad_oov_mask.unsqueeze(
            -1)

        query_embeddings_context = self.stacked_att(query_embeddings,
                                                    query_pad_oov_mask)
        document_embeddings_context = self.stacked_att(document_embeddings,
                                                       document_pad_oov_mask)

        #query_embeddings = torch.cat([query_embeddings,query_embeddings_context],dim=2) * query_pad_oov_mask.unsqueeze(-1)
        #document_embeddings = torch.cat([document_embeddings,document_embeddings_context],dim=2) * document_pad_oov_mask.unsqueeze(-1)
        query_embeddings = (self.mixer * query_embeddings +
                            (1 - self.mixer) * query_embeddings_context
                            ) * query_pad_oov_mask.unsqueeze(-1)
        document_embeddings = (self.mixer * document_embeddings +
                               (1 - self.mixer) * document_embeddings_context
                               ) * document_pad_oov_mask.unsqueeze(-1)

        #
        # prepare embedding tensors & paddings masks
        # -------------------------------------------------------

        query_by_doc_mask = torch.bmm(
            query_pad_oov_mask.unsqueeze(-1),
            document_pad_oov_mask.unsqueeze(-1).transpose(-1, -2))
        query_by_doc_mask_view = query_by_doc_mask.unsqueeze(-1)

        #
        # cosine matrix
        # -------------------------------------------------------

        # shape: (batch, query_max, doc_max)
        cosine_matrix = self.cosine_module.forward(query_embeddings,
                                                   document_embeddings)
        cosine_matrix_masked = cosine_matrix * query_by_doc_mask
        cosine_matrix_extradim = cosine_matrix_masked.unsqueeze(-1)

        #
        # gaussian kernels & soft-TF
        #
        # first run through kernel, then sum on doc dim then sum on query dim
        # -------------------------------------------------------

        raw_kernel_results = torch.exp(
            -torch.pow(cosine_matrix_extradim - self.mu, 2) /
            (2 * torch.pow(self.sigma, 2)))
        kernel_results_masked = raw_kernel_results * query_by_doc_mask_view

        #
        # mean kernels
        #
        #kernel_results_masked2 = kernel_results_masked.clone()

        doc_lengths = torch.sum(document_pad_oov_mask, 1)

        #kernel_results_masked2_mean = kernel_results_masked / doc_lengths.unsqueeze(-1)

        per_kernel_query = torch.sum(kernel_results_masked, 2)
        log_per_kernel_query = torch.log2(
            torch.clamp(per_kernel_query, min=1e-10)) * self.nn_scaler
        log_per_kernel_query_masked = log_per_kernel_query * query_pad_oov_mask.unsqueeze(
            -1)  # make sure we mask out padding values
        per_kernel = torch.sum(log_per_kernel_query_masked, 1)

        #per_kernel_query_mean = torch.sum(kernel_results_masked2_mean, 2)

        per_kernel_query_mean = per_kernel_query / (
            doc_lengths.view(-1, 1, 1) + 1
        )  # well, that +1 needs an explanation, sometimes training data is just broken ... (and nans all the things!)

        log_per_kernel_query_mean = per_kernel_query_mean * self.nn_scaler
        log_per_kernel_query_masked_mean = log_per_kernel_query_mean * query_pad_oov_mask.unsqueeze(
            -1)  # make sure we mask out padding values
        per_kernel_mean = torch.sum(log_per_kernel_query_masked_mean, 1)

        ##
        ## "Learning to rank" layer - connects kernels with learned weights
        ## -------------------------------------------------------

        dense_out = self.dense(per_kernel)
        dense_mean_out = self.dense_mean(per_kernel_mean)
        dense_comb_out = self.dense_comb(
            torch.cat([dense_out, dense_mean_out], dim=1))
        score = torch.squeeze(dense_comb_out, 1)  #torch.tanh(dense_out), 1)

        if output_secondary_output:
            query_mean_vector = query_embeddings.sum(
                dim=1) / query_pad_oov_mask.sum(dim=1).unsqueeze(-1)
            return score, {
                "score": score,
                "dense_out": dense_out,
                "dense_mean_out": dense_mean_out,
                "per_kernel": per_kernel,
                "per_kernel_mean": per_kernel_mean,
                "query_mean_vector": query_mean_vector,
                "cosine_matrix_masked": cosine_matrix_masked
            }
        else:
            return score

    def forward_representation(self, sequence_embeddings: torch.Tensor,
                               sequence_mask: torch.Tensor) -> torch.Tensor:
        seq_embeddings = sequence_embeddings * sequence_mask.unsqueeze(-1)
        seq_embeddings_context = self.stacked_att(sequence_embeddings,
                                                  sequence_mask)
        seq_embeddings = (self.mixer * sequence_embeddings + (1 - self.mixer) *
                          seq_embeddings_context) * sequence_mask.unsqueeze(-1)
        return seq_embeddings

    def get_param_stats(
        self
    ):  #" b: "+str(self.dense.bias.data) +\ "b: "+str(self.dense_mean.bias.data) +
        return "TK: dense w: "+str(self.dense.weight.data)+\
        "dense_mean weight: "+str(self.dense_mean.weight.data)+\
        "dense_comb weight: "+str(self.dense_comb.weight.data) + "scaler: "+str(self.nn_scaler.data) +"mixer: "+str(self.mixer.data)

    def get_param_secondary(self):
        return {
            "dense_weight": self.dense.weight,  #"dense_bias":self.dense.bias,
            "dense_mean_weight":
            self.dense_mean.weight,  #"dense_mean_bias":self.dense_mean.bias,
            "dense_comb_weight": self.dense_comb.weight,
            "scaler": self.nn_scaler,
            "mixer": self.mixer
        }
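A minimal, self-contained sketch of the Gaussian-kernel soft-TF pooling that TK_v1 (and the KNRM-style models in the other examples) builds on, run on a toy cosine matrix; the kernel centers and widths below are made-up values, and the learned scaler, masking and mean branch are left out:

import torch

# toy setup: batch=1, 2 query terms, 3 document terms, 3 kernels
cosine = torch.tensor([[[1.0, 0.4, 0.0],
                        [0.2, 0.9, 0.1]]])                 # (batch, query_max, doc_max)
mu    = torch.tensor([1.0, 0.5, 0.0]).view(1, 1, 1, -1)    # kernel centers
sigma = torch.tensor([0.1, 0.1, 0.1]).view(1, 1, 1, -1)    # kernel widths

kernels = torch.exp(-(cosine.unsqueeze(-1) - mu) ** 2 / (2 * sigma ** 2))
per_kernel_query = kernels.sum(dim=2)                      # sum over document terms -> soft-TF
log_per_kernel = torch.log2(torch.clamp(per_kernel_query, min=1e-10))
per_kernel = log_per_kernel.sum(dim=1)                     # sum over query terms
print(per_kernel.shape)                                    # torch.Size([1, 3]) -> one value per kernel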
Ejemplo n.º 26
0
class TK_v2(nn.Module):
    '''
    TK is a neural IR model - a fusion between transformer contextualization & kernel-based scoring

    -> uses 1 transformer block to contextualize embeddings
    -> soft-histogram kernels to score interactions

    '''
    @staticmethod
    def from_config(config, word_embeddings_out_dim):

        ws = [20, 30, 50, 80, 100, 120, 150]
        max_windows = [
            math.ceil(config["max_doc_length"] / float(w)) for w in ws
        ]

        return TK_v2(word_embeddings_out_dim,
                     kernels_mu=config["tk_kernels_mu"],
                     kernels_sigma=config["tk_kernels_sigma"],
                     att_heads=config["tk_att_heads"],
                     att_layer=config["tk_att_layer"],
                     att_proj_dim=config["tk_att_proj_dim"],
                     att_ff_dim=config["tk_att_ff_dim"],
                     win_size=ws,
                     max_windows=max_windows)

    def __init__(self, _embsize: int, kernels_mu: List[float],
                 kernels_sigma: List[float], att_heads: int, att_layer: int,
                 att_proj_dim: int, att_ff_dim: int, win_size: List[int],
                 max_windows: List[int]):

        super(TK_v2, self).__init__()

        n_kernels = len(kernels_mu)

        if len(kernels_mu) != len(kernels_sigma):
            raise Exception("len(kernels_mu) != len(kernels_sigma)")

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.cuda.FloatTensor(kernels_mu),
                           requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.cuda.FloatTensor(kernels_sigma),
                              requires_grad=False).view(1, 1, 1, n_kernels)
        self.mixer = nn.Parameter(
            torch.full([1, 1, 1], 0.5, dtype=torch.float32,
                       requires_grad=True))

        self.stacked_att = StackedSelfAttentionEncoder(
            input_dim=_embsize,
            hidden_dim=_embsize,
            projection_dim=att_proj_dim,
            feedforward_hidden_dim=att_ff_dim,
            num_layers=att_layer,
            num_attention_heads=att_heads,
            dropout_prob=0,
            residual_dropout_prob=0,
            attention_dropout_prob=0)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
        self.cosine_module = CosineMatrixAttention()

        self.nn_scaler = nn.ParameterList([
            nn.Parameter(
                torch.full([1], 0.01, dtype=torch.float32, requires_grad=True))
            for w in win_size
        ])

        self.kernel_weights = nn.ModuleList(
            [nn.Linear(n_kernels, 1, bias=False) for w in win_size])

        self.window_size = win_size
        self.window_scorer = []
        for w in max_windows:
            l = nn.Linear(w, 1, bias=False)
            torch.nn.init.constant_(l.weight, 1 / w)
            self.window_scorer.append(l)

        self.window_scorer = nn.ModuleList(self.window_scorer)

        self.window_merger = nn.Linear(len(self.window_size), 1, bias=False)

    def forward(self,
                query_embeddings: torch.Tensor,
                document_embeddings: torch.Tensor,
                query_pad_oov_mask: torch.Tensor,
                document_pad_oov_mask: torch.Tensor,
                output_secondary_output: bool = False) -> torch.Tensor:
        # pylint: disable=arguments-differ

        query_embeddings = query_embeddings * query_pad_oov_mask.unsqueeze(-1)
        document_embeddings = document_embeddings * document_pad_oov_mask.unsqueeze(
            -1)

        query_embeddings_context = self.stacked_att(query_embeddings,
                                                    query_pad_oov_mask)
        document_embeddings_context = self.stacked_att(document_embeddings,
                                                       document_pad_oov_mask)

        #query_embeddings = torch.cat([query_embeddings,query_embeddings_context],dim=2) * query_pad_oov_mask.unsqueeze(-1)
        #document_embeddings = torch.cat([document_embeddings,document_embeddings_context],dim=2) * document_pad_oov_mask.unsqueeze(-1)
        query_embeddings = (self.mixer * query_embeddings +
                            (1 - self.mixer) * query_embeddings_context
                            ) * query_pad_oov_mask.unsqueeze(-1)
        document_embeddings = (self.mixer * document_embeddings +
                               (1 - self.mixer) * document_embeddings_context
                               ) * document_pad_oov_mask.unsqueeze(-1)

        #
        # prepare embedding tensors & paddings masks
        # -------------------------------------------------------

        query_by_doc_mask = torch.bmm(
            query_pad_oov_mask.unsqueeze(-1),
            document_pad_oov_mask.unsqueeze(-1).transpose(-1, -2))
        query_by_doc_mask_view = query_by_doc_mask.unsqueeze(-1)

        #
        # cosine matrix
        # -------------------------------------------------------

        # shape: (batch, query_max, doc_max)
        cosine_matrix = self.cosine_module.forward(query_embeddings,
                                                   document_embeddings)
        cosine_matrix_masked = torch.tanh(cosine_matrix * query_by_doc_mask)
        cosine_matrix_extradim = cosine_matrix_masked.unsqueeze(-1)

        #
        # gaussian kernels & soft-TF
        #
        # first run through kernel, then sum on doc dim then sum on query dim
        # -------------------------------------------------------

        raw_kernel_results = torch.exp(
            -torch.pow(cosine_matrix_extradim - self.mu, 2) /
            (2 * torch.pow(self.sigma, 2)))
        kernel_results_masked = raw_kernel_results * query_by_doc_mask_view

        #
        # mean kernels
        #
        #kernel_results_masked2 = kernel_results_masked.clone()

        individual_window_scores = []

        for i, window in enumerate(self.window_size):

            kernel_results_masked = nn.functional.pad(
                kernel_results_masked,
                (0, 0, 0, window - kernel_results_masked.shape[-2] % window))

            scoring_windows = kernel_results_masked.unfold(dimension=-2,
                                                           size=window,
                                                           step=window)

            scoring_windows = scoring_windows.transpose(-1, -2)
            #kernel_results_masked2_mean = kernel_results_masked / doc_lengths.unsqueeze(-1)

            per_kernel_query = torch.sum(scoring_windows, -2)
            log_per_kernel_query = torch.log(
                torch.clamp(per_kernel_query, min=1e-10))  #*
            log_per_kernel_query_masked = log_per_kernel_query * (
                per_kernel_query.sum(dim=-1) != 0).unsqueeze(-1).float()
            #log_per_kernel_query_masked = log_per_kernel_query * query_pad_oov_mask.unsqueeze(-1).unsqueeze(-1) # make sure we mask out padding values
            per_kernel = torch.sum(log_per_kernel_query_masked, 1)

            window_scores = self.kernel_weights[i](per_kernel).squeeze(-1)

            window_scores_exp = torch.exp(
                window_scores *
                self.nn_scaler[i]) * (window_scores != 0).float()
            #window_scores_exp=window_scores
            if window_scores_exp.shape[-1] > self.window_scorer[i].in_features:
                window_scores_exp = window_scores_exp[:, :self.window_scorer[i]
                                                      .in_features]
            if window_scores_exp.shape[-1] < self.window_scorer[i].in_features:
                window_scores_exp = nn.functional.pad(
                    window_scores_exp, (0, self.window_scorer[i].in_features -
                                        window_scores_exp.shape[-1]))

            window_scores_exp = window_scores_exp.sort(dim=-1,
                                                       descending=True)[0]

            individual_window_scores.append(
                self.window_scorer[i](window_scores_exp))
        #final_score = window_scores.sum(dim=-1) / (window_scores != 0).sum(dim=-1).float()

        final_window_score = self.window_merger(
            torch.cat(individual_window_scores, dim=1))
        score = torch.squeeze(final_window_score,
                              1)  #torch.tanh(dense_out), 1)
        if output_secondary_output:
            return score, {}
        return score

    def get_param_stats(self):
        return "tk_v2: "+\
            " ".join([" kernel_weight ("+str(self.window_size[i])+")"+str(w.weight.data) for i,w in enumerate(self.kernel_weights)])+"\n"+\
            " ".join([" nn_scaler ("+str(self.window_size[i])+")"+str(w.data) for i,w in enumerate(self.nn_scaler)])+"\n"+\
            " ".join([" window_scorer ("+str(self.window_size[i])+")"+str(w.weight.data) for i,w in enumerate(self.window_scorer)])+"\n"+\
            "mixer: "+str(self.mixer.data) + "window_merger: "+str(self.window_merger.weight.data)

    def get_param_secondary(self):
        return {  #"dense_weight":self.dense.weight,"dense_bias":self.dense.bias,
            #"dense_mean_weight":self.dense_mean.weight,"dense_mean_bias":self.dense_mean.bias,
            "window_merger": self.window_merger.weight,
            #"scaler: ":self.nn_scaler ,
            "mixer: ": self.mixer
        }
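The distinctive step in TK_v2 above is splitting the kernel results into fixed-size, non-overlapping document windows before pooling. A minimal sketch of that unfold step on a random toy tensor (a toy window size of 3 instead of the model's [20, 30, 50, ...]):

import torch
import torch.nn as nn

window = 3                                            # toy window size
kernel_results = torch.rand(2, 4, 10, 5)              # (batch, query_max, doc_max, n_kernels)

# pad the document dimension so it divides evenly into windows
pad = window - kernel_results.shape[-2] % window
kernel_results = nn.functional.pad(kernel_results, (0, 0, 0, pad))

windows = kernel_results.unfold(dimension=-2, size=window, step=window).transpose(-1, -2)
print(windows.shape)                                  # torch.Size([2, 4, 4, 3, 5]) -> 4 windows of 3 doc terms
per_window_soft_tf = windows.sum(dim=-2)              # kernel pooling inside each window
print(per_window_soft_tf.shape)                       # torch.Size([2, 4, 4, 5])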
class MatchPyramid(nn.Module):
    '''
    Paper: Text Matching as Image Recognition, Pang et al., AAAI'16

    Reference code (but in tensorflow):
    
    * first-hand: https://github.com/pl8787/MatchPyramid-TensorFlow/blob/master/model/model_mp.py
    
    * somewhat-third-hand reference: https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/matchpyramid.py

    '''
    @staticmethod
    def from_config(config, word_embeddings_out_dim):
        return MatchPyramid(
            conv_output_size=config["match_pyramid_conv_output_size"],
            conv_kernel_size=config["match_pyramid_conv_kernel_size"],
            adaptive_pooling_size=config["match_pyramid_adaptive_pooling_size"]
        )

    def __init__(self, conv_output_size: List[int],
                 conv_kernel_size: List[Tuple[int, int]],
                 adaptive_pooling_size: List[Tuple[int, int]]):

        super(MatchPyramid, self).__init__()

        self.cosine_module = CosineMatrixAttention()

        if len(conv_output_size) != len(conv_kernel_size) or len(
                conv_output_size) != len(adaptive_pooling_size):
            raise Exception(
                "conv_output_size, conv_kernel_size, adaptive_pooling_size must have the same length"
            )

        conv_layer_dict = OrderedDict()
        last_channel_out = 1
        for i in range(len(conv_output_size)):
            conv_layer_dict["pad " + str(i)] = nn.ConstantPad2d(
                (0, conv_kernel_size[i][0] - 1, 0, conv_kernel_size[i][1] - 1),
                0)
            conv_layer_dict["conv " + str(i)] = nn.Conv2d(
                kernel_size=conv_kernel_size[i],
                in_channels=last_channel_out,
                out_channels=conv_output_size[i])
            conv_layer_dict["relu " + str(i)] = nn.ReLU()
            conv_layer_dict["pool " + str(i)] = nn.AdaptiveMaxPool2d(
                adaptive_pooling_size[i])
            last_channel_out = conv_output_size[i]

        self.conv_layers = nn.Sequential(conv_layer_dict)

        self.dense = nn.Linear(conv_output_size[-1] *
                               adaptive_pooling_size[-1][0] *
                               adaptive_pooling_size[-1][1],
                               out_features=100,
                               bias=True)
        self.dense2 = nn.Linear(100, out_features=10, bias=True)
        self.dense3 = nn.Linear(10, out_features=1, bias=False)

        # init with small weights, otherwise the dense output is way too high for the tanh -> resulting in loss == 1 all the time
        #torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014)  # inits taken from matchzoo
        #self.dense.bias.data.fill_(0.0)

    def forward(self,
                query_embeddings: torch.Tensor,
                document_embeddings: torch.Tensor,
                query_pad_oov_mask: torch.Tensor,
                document_pad_oov_mask: torch.Tensor,
                output_secondary_output: bool = False) -> torch.Tensor:

        #
        # similarity matrix
        # -------------------------------------------------------

        cosine_matrix = self.cosine_module.forward(query_embeddings,
                                                   document_embeddings)
        # shape: (batch, 1, query_max, doc_max) for the input of conv_2d
        cosine_matrix = cosine_matrix[:, None, :, :]

        #
        # convolution
        # -------------------------------------------------------
        # shape: (batch, conv_output_size, query_max, doc_max)

        conv_result = self.conv_layers(cosine_matrix)

        #
        # dynamic pooling
        # -------------------------------------------------------

        # flatten the output of dynamic pooling

        # shape: (batch, conv_output_size * pool_h * pool_w)
        conv_result_flat = conv_result.view(conv_result.size(0), -1)

        #conv_result_flat = self.dropout(conv_result_flat)

        #
        # Learning to rank layer
        # -------------------------------------------------------
        dense_out = F.relu(self.dense(conv_result_flat))
        dense_out = F.relu(self.dense2(dense_out))
        dense_out = self.dense3(dense_out)
        #tanh_out = torch.tanh(dense_out)

        output = torch.squeeze(dense_out, 1)
        if output_secondary_output:
            return output, {}
        return output

    def get_param_stats(self):
        return "MP: / "

    def get_param_secondary(self):
        return {}
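
# Aside: a minimal, self-contained sketch of the adaptive-pooling idea used above
# (illustrative only, not part of the snippet). nn.AdaptiveMaxPool2d maps a
# variable-sized similarity matrix to a fixed grid, so the dense layers always see
# the same input size regardless of document length. All sizes below are made up.
def _adaptive_pooling_demo():
    import torch
    import torch.nn as nn

    pool = nn.AdaptiveMaxPool2d((5, 10))  # fixed (pool_h, pool_w) output grid

    short_doc = torch.rand(2, 8, 14, 30)  # (batch, channels, query_max, doc_max)
    long_doc = torch.rand(2, 8, 14, 200)  # same queries, much longer documents

    # both collapse to (2, 8, 5, 10) -> the flattened feature size stays constant
    print(pool(short_doc).shape, pool(long_doc).shape)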
Example no. 28
0
class KNRM(nn.Module):
    '''
    Paper: End-to-End Neural Ad-hoc Ranking with Kernel Pooling, Xiong et al., SIGIR'17
    '''
    def __init__(self, word_embeddings: TextFieldEmbedder, n_kernels: int):

        super(KNRM, self).__init__()

        self.word_embeddings = word_embeddings

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.FloatTensor(self.kernel_mus(n_kernels)),
                           requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.FloatTensor(self.kernel_sigmas(n_kernels)),
                              requires_grad=False).view(1, 1, 1, n_kernels)

        # cosine similarity matrix module
        self.cosine_module = CosineMatrixAttention()
        # linear layer that combines the per-kernel soft-TF features into a single score
        self.transform = nn.Linear(n_kernels, out_features=1, bias=True)

    def forward(self, query: Dict[str, torch.Tensor],
                document: Dict[str, torch.Tensor]) -> torch.Tensor:
        # pylint: disable=arguments-differ

        #
        # prepare embedding tensors & paddings masks
        # -------------------------------------------------------

        # shape: (batch, query_max)
        query_pad_oov_mask = (query["tokens"] >
                              0).float().cuda()  # > 1 to also mask oov terms
        # shape: (batch, doc_max)
        document_pad_oov_mask = (document["tokens"] > 0).float().cuda()

        # shape: (batch, query_max,emb_dim)
        query_embeddings = self.word_embeddings(query)
        # shape: (batch, document_max,emb_dim)
        document_embeddings = self.word_embeddings(document)

        # create a query-by-document mask matrix (getMaskedEmbed is a helper defined elsewhere in this example)
        maskedEmbed = getMaskedEmbed(query_pad_oov_mask, document_pad_oov_mask)
        maskedEmbed = maskedEmbed.unsqueeze(-1).cuda()
        #
        # cosine matrix
        # -------------------------------------------------------
        matchMatrix = self.cosine_module.forward(query_embeddings,
                                                 document_embeddings)

        # add an extra dimension to solve the dimensionality mismatch (align with the kernel dimension)
        matchMatrix = matchMatrix.unsqueeze(-1).cuda()

        mu = self.mu.cuda()
        sigma = self.sigma.cuda()

        # calculate the soft-TF features from the match matrix (calculateKernel is a helper defined elsewhere in this example)
        sofTFFeatures = calculateKernel(matchMatrix=matchMatrix,
                                        maskedMatrix=maskedEmbed,
                                        queryMask=query_pad_oov_mask,
                                        mu=mu,
                                        sigma=sigma)

        # apply the linear transformation to the soft-TF features,
        # take the hyperbolic tangent of the result,
        # and remove the 2nd tensor dimension if its size is 1
        output = torch.squeeze(torch.tanh(self.transform(sofTFFeatures)),
                               1).cuda()

        return output

    def kernel_mus(self, n_kernels: int):
        """
        get the mu for each guassian kernel. Mu is the middle of each bin
        :param n_kernels: number of kernels (including exact match). first one is exact match
        :return: l_mu, a list of mu.
        """
        l_mu = [1.0]
        if n_kernels == 1:
            return l_mu

        bin_size = 2.0 / (n_kernels - 1)  # score range from [-1, 1]
        l_mu.append(1 - bin_size / 2)  # mu: middle of the bin
        for i in range(1, n_kernels - 1):
            l_mu.append(l_mu[i] - bin_size)
        return l_mu

    def kernel_sigmas(self, n_kernels: int):
        """
        get sigmas for each guassian kernel.
        :param n_kernels: number of kernels (including exactmath.)
        :param lamb:
        :param use_exact:
        :return: l_sigma, a list of simga
        """
        bin_size = 2.0 / (n_kernels - 1)
        l_sigma = [0.0001]  # for exact match. small variance -> exact match
        if n_kernels == 1:
            return l_sigma

        l_sigma += [0.5 * bin_size] * (n_kernels - 1)
        return l_sigma
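
# Aside: a worked example (not part of the snippet) of what kernel_mus and
# kernel_sigmas above return for n_kernels = 11. The first kernel (mu = 1.0 with a
# tiny sigma) acts as an exact-match bin; the remaining ten tile the cosine range
# [-1, 1] with bin_size = 2 / (11 - 1) = 0.2:
#
#   l_mu    = [1.0, 0.9, 0.7, 0.5, 0.3, 0.1, -0.1, -0.3, -0.5, -0.7, -0.9]
#   l_sigma = [0.0001, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
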
class Conv_KNRM(nn.Module):
    '''
    Paper: Convolutional Neural Networks for Soft-Matching N-Grams in Ad-hoc Search, Dai et al., WSDM'18

    third-hand reference: https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/conv_knrm.py (tensorflow)
    https://github.com/thunlp/EntityDuetNeuralRanking/blob/master/baselines/CKNRM.py (pytorch)

    '''
    def __init__(self, word_embeddings: TextFieldEmbedder, n_grams: int,
                 n_kernels: int, conv_out_dim: int):

        super(Conv_KNRM, self).__init__()

        self.word_embeddings = word_embeddings

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.cuda.FloatTensor(self.kernel_mus(n_kernels)),
                           requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.cuda.FloatTensor(
            self.kernel_sigmas(n_kernels)),
                              requires_grad=False).view(1, 1, 1, n_kernels)

        self.convolutions = []
        for i in range(1, n_grams + 1):
            self.convolutions.append(
                nn.Sequential(
                    nn.ConstantPad1d((0, i - 1), 0),
                    nn.Conv1d(kernel_size=i,
                              in_channels=word_embeddings.get_output_dim(),
                              out_channels=conv_out_dim), nn.ReLU()))
        self.convolutions = nn.ModuleList(
            self.convolutions)  # register conv as part of the model

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
        self.cosine_module = CosineMatrixAttention()

        # n_grams * n_grams because we concatenate the match sums of every query/document n-gram combination (e.g. 3x3 = 9 for trigrams) before the dense layer
        self.dense = nn.Linear(n_kernels * n_grams * n_grams, 1, bias=False)

        # init with small weights, otherwise the dense output is way too high for the tanh -> resulting in loss == 1 all the time
        torch.nn.init.uniform_(self.dense.weight, -0.014,
                               0.014)  # inits taken from matchzoo

    def forward(self, query: Dict[str, torch.Tensor],
                document: Dict[str, torch.Tensor], query_length: torch.Tensor,
                document_length: torch.Tensor) -> torch.Tensor:

        #
        # prepare embedding tensors
        # -------------------------------------------------------

        # we assume 1 is the unknown token, 0 is padding - both need to be removed
        if len(query["tokens"].shape) == 2:  # (embedding lookup matrix)

            # shape: (batch, query_max)
            query_pad_oov_mask = (query["tokens"] > 1).float()
            # shape: (batch, doc_max)
            document_pad_oov_mask = (document["tokens"] > 1).float()

            # shape: (batch, query_max)
            query_pad_mask = (query["tokens"] > 0).float()
            # shape: (batch, doc_max)
            document_pad_mask = (document["tokens"] > 0).float()

        else:  # == 3 (elmo characters per word)

            # shape: (batch, query_max)
            query_pad_oov_mask = (torch.sum(query["tokens"], 2) > 0).float()
            query_pad_mask = query_pad_oov_mask
            # shape: (batch, doc_max)
            document_pad_oov_mask = (torch.sum(document["tokens"], 2) >
                                     0).float()
            document_pad_mask = document_pad_oov_mask

        query_by_doc_mask = torch.bmm(
            query_pad_mask.unsqueeze(-1),
            document_pad_mask.unsqueeze(-1).transpose(-1, -2))
        #query_by_doc_mask_view = query_by_doc_mask.unsqueeze(-1)

        # shape: (batch, query_max,emb_dim)
        query_embeddings = self.word_embeddings(
            query) * query_pad_oov_mask.unsqueeze(-1)
        # shape: (batch, document_max,emb_dim)
        document_embeddings = self.word_embeddings(
            document) * document_pad_oov_mask.unsqueeze(-1)

        # !! conv1d requires tensor in shape: [batch, emb_dim, sequence_length ]
        # so we transpose embedding tensors from : [batch, sequence_length,emb_dim] to [batch, emb_dim, sequence_length ]
        # feed that into the conv1d and reshape output from [batch, conv1d_out_channels, sequence_length ]
        # to [batch, sequence_length, conv1d_out_channels]

        query_embeddings_t = query_embeddings.transpose(1, 2)
        document_embeddings_t = document_embeddings.transpose(1, 2)

        query_results = []
        document_results = []

        for i, conv in enumerate(self.convolutions):
            query_conv = conv(query_embeddings_t).transpose(1, 2)
            document_conv = conv(document_embeddings_t).transpose(1, 2)

            query_results.append(query_conv)
            document_results.append(document_conv)

        matched_results = []

        for i in range(len(query_results)):
            for t in range(len(query_results)):
                matched_results.append(
                    self.forward_matrix_kernel_pooling(query_results[i],
                                                       document_results[t],
                                                       query_by_doc_mask,
                                                       query_pad_mask))

        #
        # "Learning to rank" layer
        # -------------------------------------------------------

        all_grams = torch.cat(matched_results, 1)

        dense_out = self.dense(all_grams)
        #tanh_out = torch.tanh(dense_out)

        output = torch.squeeze(dense_out, 1)
        return output

    #
    # create a match matrix between query & document n-gram representations and apply kernel pooling
    #
    def forward_matrix_kernel_pooling(self, query_tensor, document_tensor,
                                      query_by_doc_mask, query_pad_oov_mask):

        #
        # cosine matrix
        # -------------------------------------------------------
        # shape: (batch, query_max, doc_max)

        cosine_matrix = self.cosine_module.forward(query_tensor,
                                                   document_tensor)
        cosine_matrix_masked = cosine_matrix * query_by_doc_mask
        cosine_matrix_extradim = cosine_matrix_masked.unsqueeze(-1)

        #
        # gaussian kernels & soft-TF
        #
        # first run through kernel, then sum on doc dim then sum on query dim
        # -------------------------------------------------------

        raw_kernel_results = torch.exp(
            -torch.pow(cosine_matrix_extradim - self.mu, 2) /
            (2 * torch.pow(self.sigma, 2)))
        kernel_results_masked = raw_kernel_results * query_by_doc_mask.unsqueeze(
            -1)

        per_kernel_query = torch.sum(kernel_results_masked, 2)
        log_per_kernel_query = torch.log(
            torch.clamp(per_kernel_query, min=1e-10)) * 0.01
        log_per_kernel_query_masked = log_per_kernel_query * query_pad_oov_mask.unsqueeze(
            -1)  # make sure we mask out padding values

        per_kernel = torch.sum(log_per_kernel_query_masked, 1)

        return per_kernel

    def kernel_mus(self, n_kernels: int):
        """
        get the mu for each guassian kernel. Mu is the middle of each bin
        :param n_kernels: number of kernels (including exact match). first one is exact match
        :return: l_mu, a list of mu.
        """
        l_mu = [1.0]
        if n_kernels == 1:
            return l_mu

        bin_size = 2.0 / (n_kernels - 1)  # score range from [-1, 1]
        l_mu.append(1 - bin_size / 2)  # mu: middle of the bin
        for i in range(1, n_kernels - 1):
            l_mu.append(l_mu[i] - bin_size)
        return l_mu

    def kernel_sigmas(self, n_kernels: int):
        """
        get sigmas for each guassian kernel.
        :param n_kernels: number of kernels (including exactmath.)
        :param lamb:
        :param use_exact:
        :return: l_sigma, a list of simga
        """
        bin_size = 2.0 / (n_kernels - 1)
        l_sigma = [0.001]  # for exact match. small variance -> exact match
        if n_kernels == 1:
            return l_sigma

        l_sigma += [0.5 * bin_size] * (n_kernels - 1)
        return l_sigma
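
# Aside: a minimal sketch (illustrative only, not part of the snippet) of the
# Gaussian kernel pooling performed in forward_matrix_kernel_pooling above, on toy
# tensors, to make the shape flow explicit. All sizes, mu and sigma values are made up.
def _kernel_pooling_demo():
    import torch

    batch, q_len, d_len, n_kernels = 2, 4, 6, 5
    cosine = torch.rand(batch, q_len, d_len) * 2 - 1    # fake similarities in [-1, 1]
    mu = torch.linspace(-0.8, 0.8, n_kernels).view(1, 1, 1, -1)
    sigma = torch.full((1, 1, 1, n_kernels), 0.1)

    # (batch, q_len, d_len, n_kernels): kernel activation for every query/document term pair
    kernels = torch.exp(-(cosine.unsqueeze(-1) - mu) ** 2 / (2 * sigma ** 2))
    per_query = torch.sum(kernels, 2)                   # sum over document terms
    log_per_query = torch.log(torch.clamp(per_query, min=1e-10)) * 0.01
    soft_tf = torch.sum(log_per_query, 1)               # sum over query terms -> (batch, n_kernels)

    print(soft_tf.shape)                                # torch.Size([2, 5])
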
class KNRM(nn.Module):
    '''
    Paper: End-to-End Neural Ad-hoc Ranking with Kernel Pooling, Xiong et al., SIGIR'17

    Reference code (paper author): https://github.com/AdeDZY/K-NRM/blob/master/knrm/model/model_knrm.py (but in tensorflow)
    third-hand reference: https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/knrm.py

    '''
    def __init__(self, word_embeddings: TextFieldEmbedder, n_kernels: int):

        super(KNRM, self).__init__()

        self.word_embeddings = word_embeddings

        # static - kernel size & magnitude variables
        self.mu = Variable(torch.cuda.FloatTensor(self.kernel_mus(n_kernels)),
                           requires_grad=False).view(1, 1, 1, n_kernels)
        self.sigma = Variable(torch.cuda.FloatTensor(
            self.kernel_sigmas(n_kernels)),
                              requires_grad=False).view(1, 1, 1, n_kernels)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
        self.cosine_module = CosineMatrixAttention()

        # bias is set to True in the original code (we found it does not help - a single constant offset cannot change the ranking order)
        self.dense = nn.Linear(n_kernels, 1, bias=False)

        # init with small weights, otherwise the dense output is way too high for the tanh -> resulting in loss == 1 all the time
        torch.nn.init.uniform_(self.dense.weight, -0.014,
                               0.014)  # inits taken from matchzoo
        #self.dense.bias.data.fill_(0.0)

    def forward(self, query: Dict[str, torch.Tensor],
                document: Dict[str, torch.Tensor], query_length: torch.Tensor,
                document_length: torch.Tensor) -> torch.Tensor:
        # pylint: disable=arguments-differ

        #
        # prepare embedding tensors & paddings masks
        # -------------------------------------------------------

        # shape: (batch, query_max,emb_dim)
        query_embeddings = self.word_embeddings(query)

        # shape: (batch, document_max,emb_dim)
        document_embeddings = self.word_embeddings(document)

        # we assume 1 is the unknown token, 0 is padding - both need to be removed

        if len(query["tokens"].shape) == 2:  # (embedding lookup matrix)
            # shape: (batch, query_max)
            query_pad_oov_mask = (query["tokens"] > 1).float()
            # shape: (batch, doc_max)
            document_pad_oov_mask = (document["tokens"] > 1).float()
        else:  # == 3 (elmo characters per word)
            # shape: (batch, query_max)
            query_pad_oov_mask = (torch.sum(query["tokens"], 2) > 0).float()
            # shape: (batch, doc_max)
            document_pad_oov_mask = (torch.sum(document["tokens"], 2) >
                                     0).float()

        query_by_doc_mask = torch.bmm(
            query_pad_oov_mask.unsqueeze(-1),
            document_pad_oov_mask.unsqueeze(-1).transpose(-1, -2))
        query_by_doc_mask_view = query_by_doc_mask.unsqueeze(-1)

        #
        # cosine matrix
        # -------------------------------------------------------

        # shape: (batch, query_max, doc_max)
        cosine_matrix = self.cosine_module.forward(query_embeddings,
                                                   document_embeddings)
        cosine_matrix_masked = cosine_matrix * query_by_doc_mask
        cosine_matrix_extradim = cosine_matrix_masked.unsqueeze(-1)

        #
        # gaussian kernels & soft-TF
        #
        # first run through kernel, then sum on doc dim then sum on query dim
        # -------------------------------------------------------

        raw_kernel_results = torch.exp(
            -torch.pow(cosine_matrix_extradim - self.mu, 2) /
            (2 * torch.pow(self.sigma, 2)))
        kernel_results_masked = raw_kernel_results * query_by_doc_mask_view

        per_kernel_query = torch.sum(kernel_results_masked, 2)
        log_per_kernel_query = torch.log(
            torch.clamp(per_kernel_query, min=1e-10)) * 0.01
        log_per_kernel_query_masked = log_per_kernel_query * query_pad_oov_mask.unsqueeze(
            -1)  # make sure we mask out padding values

        per_kernel = torch.sum(log_per_kernel_query_masked, 1)

        #
        # "Learning to rank" layer - connects kernels with learned weights
        # -------------------------------------------------------

        dense_out = self.dense(per_kernel)
        score = torch.squeeze(dense_out, 1)  #torch.tanh(dense_out), 1)
        return score

    def kernel_mus(self, n_kernels: int):
        """
        get the mu for each guassian kernel. Mu is the middle of each bin
        :param n_kernels: number of kernels (including exact match). first one is exact match
        :return: l_mu, a list of mu.
        """
        l_mu = [1.0]
        if n_kernels == 1:
            return l_mu

        bin_size = 2.0 / (n_kernels - 1)  # score range from [-1, 1]
        l_mu.append(1 - bin_size / 2)  # mu: middle of the bin
        for i in range(1, n_kernels - 1):
            l_mu.append(l_mu[i] - bin_size)
        return l_mu

    def kernel_sigmas(self, n_kernels: int):
        """
        get sigmas for each guassian kernel.
        :param n_kernels: number of kernels (including exactmath.)
        :param lamb:
        :param use_exact:
        :return: l_sigma, a list of simga
        """
        bin_size = 2.0 / (n_kernels - 1)
        l_sigma = [0.0001]  # for exact match. small variance -> exact match
        if n_kernels == 1:
            return l_sigma

        l_sigma += [0.5 * bin_size] * (n_kernels - 1)
        return l_sigma
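
# Aside: a minimal sketch (illustrative only, not part of the snippet) of how the
# query-by-document mask used in forward above is built from the two padding masks
# with torch.bmm. A position is 1 only where both the query term and the document
# term are real (non-padding). The toy masks are made up.
def _query_by_doc_mask_demo():
    import torch

    query_mask = torch.tensor([[1., 1., 0.]])        # (batch=1, query_max=3), last term is padding
    doc_mask = torch.tensor([[1., 1., 1., 0., 0.]])  # (batch=1, doc_max=5), last two are padding

    query_by_doc_mask = torch.bmm(query_mask.unsqueeze(-1),
                                  doc_mask.unsqueeze(-1).transpose(-1, -2))
    print(query_by_doc_mask)  # shape (1, 3, 5); zeros wherever either side is padding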