import torch
import torch.nn as nn

# AbstractEmbedding and Indexer are provided by the surrounding project.


class AbsolutePositionalEmbedding(AbstractEmbedding):
    def __init__(self, device):
        super(AbsolutePositionalEmbedding, self).__init__(device=device)
        self.max_length = 150
        self.indexer = Indexer(special_tokens={
            '<s>': 0,
            '<unk>': 1,
            '<pad>': 2,
            '<\\s>': 3,
            '<mask>': 4
        },
                               with_del_stopwords=self.with_del_stopwords)
        # Register the positions "0" .. "149" as vocabulary entries so they
        # can be looked up like ordinary tokens.
        self.indexer.add_sentence(list(map(str, range(self.max_length))),
                                  with_raw=True)
        self.embedding_dim = 20
        self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                      embedding_dim=self.embedding_dim,
                                      padding_idx=self.indexer.padding_index)
        self.embedding.to(device)

    def forward(self, sentences):
        sentences = [self.indexer.tokenize(sentence) for sentence in sentences]
        # Replace each token with its 0-based position, stringified so the
        # position can be looked up through the Indexer vocabulary.
        sentences = [[str(i) for i, _ in enumerate(sentence)]
                     for sentence in sentences]
        indexes = [[self.indexer.get_index(word) for word in sentence]
                   for sentence in sentences]
        pad_indexes = self.pad_sequence(indexes)
        pad_indexes = torch.tensor(pad_indexes, dtype=torch.long,
                                   device=self.device)
        vectors = self.embedding(pad_indexes)
        return vectors
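
A minimal usage sketch for the class above, reusing the imports from the snippet; it assumes the project-local AbstractEmbedding, Indexer, and pad_sequence pieces are importable, and the batch and device handling here are illustrative only:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedder = AbsolutePositionalEmbedding(device=device)
batch = ['the cat sat on the mat', 'a shorter sentence']
vectors = embedder(batch)
# Expected shape: (batch_size, padded_sentence_length, 20)
print(vectors.shape)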
Example #2
import nltk
import torch
import torch.nn as nn

# AbstractEmbedding, Indexer, and Dataset are provided by the surrounding project.


class PostagEmbedding(AbstractEmbedding):
    def __init__(self, device):
        super(PostagEmbedding, self).__init__(device=device)
        self.indexer = Indexer(
            special_tokens={
                '<s>': 0,
                '<unk>': 1,
                '<pad>': 2,
                '<\\s>': 3,
                '<mask>': 4
            },
            with_del_stopwords=False)  # always False for the postag embedding
        # Build the POS-tag vocabulary from the training split.
        datasets = Dataset().get_instance()
        sentences = [
            nltk.pos_tag(self.indexer.tokenize(pairs[0]))
            for pairs in datasets['train']
        ]
        # Keep only the tag from each (token, tag) pair.
        sentences = [[pairs[1] for pairs in sentence]
                     for sentence in sentences]
        for sentence in sentences:
            self.indexer.add_sentence(sentence, with_raw=True)
        self.embedding_dim = 10
        self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                      embedding_dim=self.embedding_dim,
                                      padding_idx=self.indexer.padding_index)
        self.embedding.to(device)

    def forward(self, sentences):
        if self.with_del_stopwords:
            # Drop the tags of stopword tokens so the tag sequence stays
            # aligned with the stopword-filtered word embeddings.
            postags = [
                nltk.pos_tag(self.indexer.tokenize(sentence))
                for sentence in sentences
            ]
            sentences = [[pairs[0] for pairs in postag] for postag in postags]
            postags = [[pairs[1] for pairs in postag] for postag in postags]
            is_stopword = self.indexer.is_stopword(sentences)
            postags = [[tag for sw, tag in zip(stopword, postag) if sw != 1]
                       for stopword, postag in zip(is_stopword, postags)]
        else:
            postags = [
                nltk.pos_tag(self.indexer.tokenize(sentence))
                for sentence in sentences
            ]
            postags = [[pairs[1] for pairs in postag] for postag in postags]
        indexes = [[self.indexer.get_index(tag) for tag in postag]
                   for postag in postags]
        pad_indexes = self.pad_sequence(indexes)
        pad_indexes = torch.tensor(pad_indexes, dtype=torch.long,
                                   device=self.device)
        vectors = self.embedding(pad_indexes)
        return vectors
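
A similar hedged usage sketch for PostagEmbedding; it only runs inside the full project, because the constructor reads the training split through Dataset().get_instance(), and nltk.pos_tag additionally needs the averaged_perceptron_tagger resource:

nltk.download('averaged_perceptron_tagger')  # one-time NLTK resource setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tag_embedder = PostagEmbedding(device=device)
batch = ['the cat sat on the mat', 'a shorter sentence']
vectors = tag_embedder(batch)
# Expected shape: (batch_size, padded_sentence_length, 10)
print(vectors.shape)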