Code example #1
File: data.py Project: WEYAI/PhoNLP
    def __getitem__(self, key):
        """ Get a batch with index. """
        if not isinstance(key, int):
            raise TypeError
        if key < 0:
            raise IndexError
        # wrap the index so repeated iteration cycles through all batches
        key = key % len(self.data)
        batch = self.data[key]
        batch_size = len(batch)
        batch = list(zip(*batch))
        assert len(batch) == 4

        # sort sentences by lens for easy RNN operations
        sentlens = [len(x) for x in batch[2]]
        batch, orig_idx = sort_all(batch, sentlens)
        sentlens = [len(x) for x in batch[2]]

        words = get_long_tensor(batch[2], batch_size)
        words_mask = torch.eq(words, PAD_ID)
        # convert to tensors
        tokens_phobert = batch[0]
        # PhoBERT (RoBERTa-style) reserves id 1 for its <pad> token
        tokens_phobert = get_long_tensor(tokens_phobert, batch_size, pad_id=1)
        first_subword = batch[1]
        first_subword = get_long_tensor(first_subword, batch_size)
        tags = get_long_tensor(batch[3], batch_size)
        return tokens_phobert, first_subword, words_mask, tags, orig_idx, sentlens
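
All four snippets lean on torch plus three helpers that the excerpts do not show: the PAD_ID constant and the sort_all and get_long_tensor functions from the project's data utilities. A minimal sketch of the two helpers, assuming Stanza-style semantics (a descending-length sort that also returns the permutation needed to restore the original order, and right-padding up to the longest sequence):

    import torch

    PAD_ID = 0  # assumption: 0 is the padding id, as in Stanza's vocab constants

    def sort_all(batch, lens):
        """Sort every field in `batch` by descending length and return the
        sorted fields plus the original indices needed to unsort later."""
        unsorted_all = [lens] + [range(len(lens))] + list(batch)
        sorted_all = [list(t) for t in zip(*sorted(zip(*unsorted_all), reverse=True))]
        return sorted_all[2:], sorted_all[1]

    def get_long_tensor(tokens_list, batch_size, pad_id=PAD_ID):
        """Pad a list of id sequences into a (batch_size, max_len) LongTensor."""
        token_len = max(len(x) for x in tokens_list)
        tokens = torch.full((batch_size, token_len), pad_id, dtype=torch.long)
        for i, s in enumerate(tokens_list):
            tokens[i, : len(s)] = torch.tensor(s, dtype=torch.long)
        return tokens

Note that sort_all never compares the payload fields themselves: the unique position index breaks every tie before a payload field would be reached.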
Code example #2
File: annotate_model.py Project: nlptrinh/PhoNLP
    def get_batch(self, key, data_chunk):
        batch = data_chunk[key]
        batch_size = len(batch)
        batch = list(zip(*batch))
        assert len(batch) == 4
        lens = [len(x) for x in batch[2]]
        batch, orig_idx = sort_all(batch, lens)

        batch_words = [w for sent in batch[3] for w in sent]
        word_lens = [len(x) for x in batch_words]
        batch_words, word_orig_idx = sort_all([batch_words], word_lens)
        batch_words = batch_words[0]  # [word1, ...], each word a list of character ids
        word_lens = [len(x) for x in batch_words]
        wordchars = get_long_tensor(batch_words, len(word_lens))
        number_of_words = wordchars.size(0)
        words = batch[2]
        words = get_long_tensor(words, batch_size)
        words_mask = torch.eq(words, PAD_ID)

        # convert to tensors
        tokens_phobert = batch[0]
        tokens_phobert = get_long_tensor(tokens_phobert, batch_size, pad_id=1)
        first_subword = batch[1]
        first_subword = get_long_tensor(first_subword, batch_size)
        sentlens = [len(x) for x in batch[1]]
        data = (tokens_phobert, first_subword, words_mask, number_of_words,
                orig_idx, sentlens)
        return data
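
Because get_batch sorts sentences (and the flattened word list) by length before padding, anything computed from the sorted batch must be mapped back through orig_idx (or word_orig_idx) before it lines up with the input order again. A sketch of the usual companion helper, assuming the same conventions as the sort_all sketch above:

    def unsort(sorted_list, oidx):
        """Undo sort_all: return the items of sorted_list in their original order."""
        assert len(sorted_list) == len(oidx), "number of items must match"
        _, unsorted = [list(t) for t in zip(*sorted(zip(oidx, sorted_list)))]
        return unsorted

For example, per-sentence predictions produced from this batch would be realigned with unsort(preds, orig_idx).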
Code example #3
File: data.py Project: WEYAI/PhoNLP
    def __getitem__(self, key):
        """ Get a batch with index. """
        if not isinstance(key, int):
            raise TypeError
        if key > 0 and key % len(self.data) == 0:
            self.reshuffle()
        batch_key = key % len(self.data)
        batch = self.data[batch_key]
        batch_size = len(batch)
        batch = list(zip(*batch))
        assert len(batch) == 3

        lens = [len(x) for x in batch[2]]
        batch, orig_idx = sort_all(batch, lens)

        # convert to tensors
        tokens_phobert = batch[0]
        tokens_phobert = get_long_tensor(tokens_phobert, batch_size, pad_id=1)
        first_subword = batch[1]
        first_subword = get_long_tensor(first_subword, batch_size)
        upos = get_long_tensor(batch[2], batch_size)
        sentlens = [len(x) for x in batch[1]]
        return tokens_phobert, first_subword, upos, orig_idx, sentlens
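
A hypothetical training loop over an object exposing this __getitem__; the DataLoaderPOS and model names are illustrative assumptions, not PhoNLP's documented API. The point is that indexing past len(self.data) both wraps around and calls reshuffle(), so one flat loop covers several epochs:

    loader = DataLoaderPOS(train_doc, batch_size=32)  # hypothetical loader class
    num_batches = len(loader.data)
    for step in range(num_epochs * num_batches):
        # crossing a multiple of num_batches reshuffles the data (next epoch)
        tokens_phobert, first_subword, upos, orig_idx, sentlens = loader[step]
        loss = model.update(tokens_phobert, first_subword, upos, sentlens)  # hypothetical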
Code example #4
File: data.py Project: WEYAI/PhoNLP
    def __getitem__(self, key):
        """ Get a batch with index. """
        if not isinstance(key, int):
            raise TypeError
        if key > 0 and key % len(self.data_dep) == 0:
            self.reshuffle()
        dep_key = key % len(self.data_dep)
        dep_batch = self.data_dep[dep_key]
        dep_batch_size = len(dep_batch)
        dep_batch = list(zip(*dep_batch))

        assert len(dep_batch) == 6

        dep_lens = [len(x) for x in dep_batch[2]]
        dep_batch, dep_orig_idx = sort_all(dep_batch, dep_lens)

        dep_batch_words = [w for sent in dep_batch[3] for w in sent]
        dep_word_lens = [len(x) for x in dep_batch_words]
        dep_batch_words, dep_word_orig_idx = sort_all([dep_batch_words],
                                                      dep_word_lens)
        dep_batch_words = dep_batch_words[0]  # [word1, ...], each word a list of character ids
        dep_word_lens = [len(x) for x in dep_batch_words]
        dep_wordchars = get_long_tensor(dep_batch_words, len(dep_word_lens))
        dep_number_of_words = dep_wordchars.size(0)
        dep_words = dep_batch[2]
        dep_words = get_long_tensor(dep_words, dep_batch_size)
        dep_words_mask = torch.eq(dep_words, PAD_ID)

        # convert to tensors
        dep_tokens_phobert = dep_batch[0]
        dep_tokens_phobert = get_long_tensor(dep_tokens_phobert,
                                             dep_batch_size,
                                             pad_id=1)
        dep_first_subword = dep_batch[1]
        dep_first_subword = get_long_tensor(dep_first_subword, dep_batch_size)
        dep_sentlens = [len(x) for x in dep_batch[1]]
        dep_head = get_long_tensor(dep_batch[4], dep_batch_size)
        dep_deprel = get_long_tensor(dep_batch[5], dep_batch_size)
        dep_data = (
            dep_tokens_phobert,
            dep_first_subword,
            dep_words_mask,
            dep_head,
            dep_deprel,
            dep_number_of_words,
            dep_orig_idx,
            dep_sentlens,
        )
        return dep_data
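
For reference, the indexing above implies the following per-example layout inside self.data_dep. This is inferred from how each column is consumed, not a documented format, and the field names are placeholders:

    example = (
        phobert_ids,        # [0] PhoBERT subword ids, later padded with pad_id=1
        first_subword_idx,  # [1] position of each word's first subword
        word_ids,           # [2] word ids, padded with PAD_ID
        word_char_ids,      # [3] per-word lists of character ids
        heads,              # [4] dependency head position for each word
        deprels,            # [5] dependency relation id for each word
    )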