Example #1
    def __init__(self,
                 char_embed,
                 num_classes,
                 bigram_embed=None,
                 trigram_embed=None,
                 num_layers=1,
                 hidden_size=100,
                 dropout=0.5,
                 target_vocab=None,
                 encoding_type=None):
        super().__init__()

        self.char_embed = get_embeddings(char_embed)
        embed_size = self.char_embed.embedding_dim
        if bigram_embed:
            self.bigram_embed = get_embeddings(bigram_embed)
            embed_size += self.bigram_embed.embedding_dim
        if trigram_embed:
            self.trigram_embed = get_embeddings(trigram_embed)
            embed_size += self.trigram_embed.embedding_dim

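        # dropout between LSTM layers only takes effect when the LSTM is stacked;
        # passing it to a single-layer LSTM would trigger a warning in PyTorch's nn.LSTM.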
        if num_layers > 1:
            self.lstm = LSTM(embed_size,
                             num_layers=num_layers,
                             hidden_size=hidden_size // 2,
                             bidirectional=True,
                             batch_first=True,
                             dropout=dropout)
        else:
            self.lstm = LSTM(embed_size,
                             num_layers=num_layers,
                             hidden_size=hidden_size // 2,
                             bidirectional=True,
                             batch_first=True)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, num_classes)

        trans = None
        if target_vocab is not None and encoding_type is not None:
            trans = allowed_transitions(target_vocab.idx2word,
                                        encoding_type=encoding_type,
                                        include_start_end=True)

        self.crf = ConditionalRandomField(num_classes,
                                          include_start_end_trans=True,
                                          allowed_transitions=trans)
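
A quick shape check clarifies the hidden_size // 2 choice: with bidirectional=True, the forward and backward outputs are concatenated back to hidden_size features per token, which is exactly what nn.Linear(hidden_size, num_classes) expects before the CRF. The sketch below reproduces that bookkeeping with plain torch.nn modules, under the assumption that the LSTM wrapper used above behaves like nn.LSTM with batch_first=True; all sizes are made-up illustration values.

import torch
from torch import nn

batch_size, seq_len, embed_size = 2, 7, 150      # e.g. char (100) + bigram (50) embeddings
hidden_size, num_classes = 100, 4                # illustration values only

lstm = nn.LSTM(embed_size, hidden_size=hidden_size // 2,
               bidirectional=True, batch_first=True)
fc = nn.Linear(hidden_size, num_classes)

x = torch.randn(batch_size, seq_len, embed_size)
feats, _ = lstm(x)        # (2, 7, 100): both directions concatenated back to hidden_size
logits = fc(feats)        # (2, 7, 4): per-token emission scores for the CRF
print(feats.shape, logits.shape)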
Example #2
    def __init__(self, init_embed, num_cls):
        super(HANCLS, self).__init__()

        self.embed = get_embeddings(init_embed)
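        # note: input_size is hard-coded to 300 below, so init_embed is assumed to produce 300-dim vectors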
        self.han = HAN(input_size=300,
                       output_size=num_cls,
                       word_hidden_size=50, word_num_layers=1, word_context_size=100,
                       sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
Example #3
 def __init__(self, init_embed,
              num_classes,
              hidden_dim=256,
              num_layers=1,
              attention_unit=256,
              attention_hops=1,
              nfc=128):
     super(BiLSTM_SELF_ATTENTION, self).__init__()
     self.embed = get_embeddings(init_embed)
     self.lstm = LSTM(input_size=self.embed.embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, bidirectional=True)
     self.attention = SelfAttention(input_size=hidden_dim * 2, attention_unit=attention_unit, attention_hops=attention_hops)
     self.mlp = MLP(size_layer=[hidden_dim * 2 * attention_hops, nfc, num_classes])
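
The size_layer list above follows from the attention output shape: the bidirectional LSTM produces hidden_dim * 2 features per token, and each of the attention_hops attention distributions yields one such summary vector, so flattening gives hidden_dim * 2 * attention_hops features per example. A minimal sketch of that flattening with plain torch, using random attention weights as stand-ins for the SelfAttention module (all sizes are illustration values):

import torch

batch, seq_len, hidden_dim, hops = 2, 9, 256, 3
lstm_out = torch.randn(batch, seq_len, hidden_dim * 2)

# one attention distribution per hop over the sequence positions
attn = torch.softmax(torch.randn(batch, hops, seq_len), dim=-1)
heads = attn @ lstm_out                # (2, 3, 512): one summary vector per hop
flat = heads.reshape(batch, -1)        # (2, 1536) == hidden_dim * 2 * hops
print(flat.shape)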
Example #4
 def __init__(self,
              init_embed,
              num_classes,
              hidden_dim=256,
              num_layers=1,
              nfc=128):
     super(BiLSTMSentiment, self).__init__()
     self.embed = get_embeddings(init_embed)
     self.lstm = LSTM(input_size=self.embed.embedding_dim,
                      hidden_size=hidden_dim,
                      num_layers=num_layers,
                      bidirectional=True)
     self.mlp = MLP(size_layer=[hidden_dim * 2, nfc, num_classes])
Example #5
 def __init__(self, init_embed, out_dim=300, kernel_sizes=None):
     super().__init__()
     if kernel_sizes is None:
         kernel_sizes = [5, 9]
     assert isinstance(kernel_sizes, list), 'kernel_sizes should be a list of int'
     self.embed = get_embeddings(init_embed)
     try:
         embed_dim = self.embed.embedding_dim
     except Exception:
         embed_dim = self.embed.embed_size
     self.region_embeds = nn.ModuleList()
     for ksz in kernel_sizes:
         self.region_embeds.append(
             nn.Sequential(nn.Conv1d(embed_dim, embed_dim, ksz, padding=ksz // 2)))
     self.linears = nn.ModuleList([
         nn.Conv1d(embed_dim, out_dim, 1) for _ in range(len(kernel_sizes))
     ])
     self.embedding_dim = embed_dim
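
The padding=ksz // 2 argument keeps the output length equal to the input length for the odd kernel sizes used here (5 and 9), so every region embedding stays aligned with the original token positions. A quick check with plain nn.Conv1d, using made-up sizes:

import torch
from torch import nn

embed_dim, seq_len = 64, 21
x = torch.randn(2, embed_dim, seq_len)       # Conv1d expects (batch, channels, length)

for ksz in (5, 9):
    conv = nn.Conv1d(embed_dim, embed_dim, ksz, padding=ksz // 2)
    print(ksz, conv(x).shape)                # length stays 21 for odd kernel sizes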
Example #6
    def __init__(self,
                 vocab: Vocabulary,
                 embed_size: int = 30,
                 char_emb_size: int = 30,
                 word_dropout: float = 0,
                 dropout: float = 0,
                 pool_method: str = 'max',
                 activation='relu',
                 min_char_freq: int = 2,
                 requires_grad=True,
                 include_word_start_end=True,
                 char_attn_type='adatrans',
                 char_n_head=3,
                 char_dim_ffn=60,
                 char_scale=False,
                 char_pos_embed=None,
                 char_dropout=0.15,
                 char_after_norm=False):
        """
        :param vocab: the word vocabulary
        :param embed_size: output dimension of TransformerCharEmbed. Defaults to 30.
        :param char_emb_size: dimension of the character embeddings; this is also the Transformer's d_model. Defaults to 30.
        :param float word_dropout: probability of replacing a word with unk. This both trains the unk embedding and acts as a mild regularizer.
        :param dropout: probability of dropping the character-embedding output as well as the final word output.
        :param pool_method: pooling method, 'max' or 'avg'.
        :param activation: activation function; supports 'relu', 'sigmoid', 'tanh', or a custom callable.
        :param min_char_freq: minimum character frequency. Defaults to 2.
        :param requires_grad: whether the embedding parameters require gradients.
        :param include_word_start_end: whether to mark the start and end of each word with special tags.
        :param char_attn_type: 'adatrans' or 'naive'.
        :param char_n_head: number of attention heads.
        :param char_dim_ffn: size of the feed-forward hidden layer inside the Transformer.
        :param char_scale: whether to apply scaling in the attention.
        :param char_pos_embed: kind of position embedding: None, 'fix', or 'sin'. When char_attn_type is relative, None is fine.
        :param char_dropout: dropout inside the Transformer encoder.
        :param char_after_norm: whether layer normalization is applied after each sub-layer (post-norm) rather than before.
        """
        super(TransformerCharEmbed, self).__init__(vocab,
                                                   word_dropout=word_dropout,
                                                   dropout=dropout)

        assert char_emb_size % char_n_head == 0, "char_emb_size (d_model) must be divisible by char_n_head."

        assert pool_method in ('max', 'avg')
        self.pool_method = pool_method
        # activation function
        if isinstance(activation, str):
            if activation.lower() == 'relu':
                self.activation = F.relu
            elif activation.lower() == 'sigmoid':
                self.activation = F.sigmoid
            elif activation.lower() == 'tanh':
                self.activation = F.tanh
            else:
                raise Exception(
                    "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]"
                )
        elif activation is None:
            self.activation = lambda x: x
        elif callable(activation):
            self.activation = activation
        else:
            raise Exception(
                "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]"
            )

        logger.info("Start constructing character vocabulary.")
        # build the character vocabulary
        self.char_vocab = _construct_char_vocab_from_vocab(
            vocab,
            min_freq=min_char_freq,
            include_word_start_end=include_word_start_end)
        self.char_pad_index = self.char_vocab.padding_idx
        logger.info(
            f"In total, there are {len(self.char_vocab)} distinct characters.")
        # index every word in vocab as a sequence of character ids
        max_word_len = max(map(lambda x: len(x[0]), vocab))
        if include_word_start_end:
            max_word_len += 2
        self.register_buffer(
            'words_to_chars_embedding',
            torch.full((len(vocab), max_word_len),
                       fill_value=self.char_pad_index,
                       dtype=torch.long))
        self.register_buffer('word_lengths', torch.zeros(len(vocab)).long())
        for word, index in vocab:
            # if index != vocab.padding_idx:  # for pad it would already be the pad value; changed to not distinguish pad from non-pad
            if include_word_start_end:
                word = ['<bow>'] + list(word) + ['<eow>']
            self.words_to_chars_embedding[index, :len(word)] = \
                torch.LongTensor([self.char_vocab.to_index(c) for c in word])
            self.word_lengths[index] = len(word)

        self.char_embedding = get_embeddings(
            (len(self.char_vocab), char_emb_size))
        self.transformer = TransformerEncoder(1,
                                              char_emb_size,
                                              char_n_head,
                                              char_dim_ffn,
                                              dropout=char_dropout,
                                              after_norm=char_after_norm,
                                              attn_type=char_attn_type,
                                              pos_embed=char_pos_embed,
                                              scale=char_scale)
        self.fc = nn.Linear(char_emb_size, embed_size)

        self._embed_size = embed_size

        self.requires_grad = requires_grad
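
The loop that fills words_to_chars_embedding and word_lengths is the trickiest part of this constructor, so here is a stand-alone sketch of the same idea using a plain dict as the character vocabulary instead of fastNLP's Vocabulary; the <bow>/<eow> markers and the pad-filled LongTensor mirror the code above, everything else is a toy illustration.

import torch

words = ['cat', 'dogs']                                  # toy word vocabulary, index = position
char_vocab = {'<pad>': 0, '<bow>': 1, '<eow>': 2}
for w in words:
    for c in w:
        char_vocab.setdefault(c, len(char_vocab))

pad = char_vocab['<pad>']
max_word_len = max(len(w) for w in words) + 2            # +2 for <bow> and <eow>
words_to_chars = torch.full((len(words), max_word_len), pad, dtype=torch.long)
word_lengths = torch.zeros(len(words), dtype=torch.long)

for index, w in enumerate(words):
    chars = ['<bow>'] + list(w) + ['<eow>']
    words_to_chars[index, :len(chars)] = torch.LongTensor([char_vocab[c] for c in chars])
    word_lengths[index] = len(chars)

print(words_to_chars)      # per-word character ids, padded with the pad index
print(word_lengths)        # true lengths including the boundary markers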
Example #7
    def __init__(self,
                 vocab: Vocabulary,
                 embed_size: int = 30,
                 char_emb_size: int = 30,
                 word_dropout: float = 0,
                 dropout: float = 0,
                 pool_method: str = 'max',
                 activation='relu',
                 min_char_freq: int = 2,
                 requires_grad=True,
                 include_word_start_end=True,
                 char_attn_type='adatrans',
                 char_n_head=3,
                 char_dim_ffn=60,
                 char_scale=False,
                 char_pos_embed=None,
                 char_dropout=0.15,
                 char_after_norm=False):
        super(TransformerCharEmbed, self).__init__(vocab,
                                                   word_dropout=word_dropout,
                                                   dropout=dropout)

        assert char_emb_size % char_n_head == 0, "char_emb_size (d_model) must be divisible by char_n_head."

        assert pool_method in ('max', 'avg')
        self.pool_method = pool_method
        # activation function
        if isinstance(activation, str):
            if activation.lower() == 'relu':
                self.activation = F.relu
            elif activation.lower() == 'sigmoid':
                self.activation = F.sigmoid
            elif activation.lower() == 'tanh':
                self.activation = F.tanh
            else:
                raise Exception(
                    "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]"
                )
        elif activation is None:
            self.activation = lambda x: x
        elif callable(activation):
            self.activation = activation
        else:
            raise Exception(
                "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]"
            )

        logger.info("Start constructing character vocabulary.")

        self.char_vocab = _construct_char_vocab_from_vocab(
            vocab,
            min_freq=min_char_freq,
            include_word_start_end=include_word_start_end)
        self.char_pad_index = self.char_vocab.padding_idx
        logger.info(
            f"In total, there are {len(self.char_vocab)} distinct characters.")

        max_word_len = max(map(lambda x: len(x[0]), vocab))
        if include_word_start_end:
            max_word_len += 2
        self.register_buffer(
            'words_to_chars_embedding',
            torch.full((len(vocab), max_word_len),
                       fill_value=self.char_pad_index,
                       dtype=torch.long))
        self.register_buffer('word_lengths', torch.zeros(len(vocab)).long())
        for word, index in vocab:

            if include_word_start_end:
                word = ['<bow>'] + list(word) + ['<eow>']
            self.words_to_chars_embedding[index, :len(word)] = \
                torch.LongTensor([self.char_vocab.to_index(c) for c in word])
            self.word_lengths[index] = len(word)

        self.char_embedding = get_embeddings(
            (len(self.char_vocab), char_emb_size))
        self.transformer = TransformerEncoder(1,
                                              char_emb_size,
                                              char_n_head,
                                              char_dim_ffn,
                                              dropout=char_dropout,
                                              after_norm=char_after_norm,
                                              attn_type=char_attn_type,
                                              pos_embed=char_pos_embed,
                                              scale=char_scale)
        self.fc = nn.Linear(char_emb_size, embed_size)

        self._embed_size = embed_size

        self.requires_grad = requires_grad