Code Example #1
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO:
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in
        ###     previous parts
        char_indices = self.words2charindices(sents)
        sents_padded = pad_sents_char(char_indices, self.char2id['<pad>'])
        sents_tensor = torch.tensor(sents_padded,
                                    dtype=torch.long,
                                    device=device)
        # pad_sents_char gives (batch_size, max_sentence_length, max_word_length);
        # permute (not view) reorders the first two dimensions without scrambling the data.
        sents_tensor = sents_tensor.permute(1, 0, 2).contiguous()
        return sents_tensor
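A minimal usage sketch (not taken from any of the projects above), assuming the assignment's vocab.py is importable and that VocabEntry() builds its default character vocabulary, as the sanity checks later on this page do:

import torch
from vocab import VocabEntry

sentences = [['Human:', 'What', 'do', 'we', 'want?'],
             ['Computer:', 'Natural', 'language', 'processing!']]

vocab = VocabEntry()
x = vocab.to_input_tensor_char(sentences, device=torch.device('cpu'))
# expected shape: (max_sentence_length, batch_size, max_word_length), e.g. torch.Size([5, 2, 21])
print(x.shape)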
Code Example #2
File: vocab.py  Project: st2yang/CS224nAssignments
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1e
        ### TODO:
        ###     - Use `words2charindices()` from this file, which converts each character to its corresponding index in the
        ###       character-vocabulary.
        ###     - Use `pad_sents_char()` from utils.py, which pads all words to max_word_length of all words in the batch,
        ###       and pads all sentences to max length of all sentences in the batch. Read __init__ to see how to get
        ###       index of character-padding token
        ###     - Connect these two parts to convert the resulting padded sentences to a torch tensor.
        ### HINT:
        ###     - You may find .contiguous() useful after reshaping. Check the following links for more details:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.contiguous
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view

        char_indices = self.words2charindices(sents)
        sents_var = torch.tensor(pad_sents_char(char_indices, 0),
                                 dtype=torch.long,
                                 device=device).permute(
                                     (1, 0, 2)).contiguous()

        return sents_var
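The HINT above about .contiguous() and .view() is easy to see on a toy tensor: after permute() the tensor is usually non-contiguous, so a later .view() raises unless .contiguous() (or .reshape()) is called first. Illustrative sketch only:

import torch

t = torch.arange(24).reshape(4, 3, 2)   # (batch_size, sent_len, word_len)
p = t.permute(1, 0, 2)                   # (sent_len, batch_size, word_len)
print(p.is_contiguous())                 # False

try:
    p.view(-1, 2)                        # fails: view needs a compatible memory layout
except RuntimeError as err:
    print('view failed:', err)

print(p.contiguous().view(-1, 2).shape)  # works: torch.Size([12, 2])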
Code Example #3
File: vocab.py  Project: evazhang612/DOC_CNN
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO:
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in
        ###     previous parts
        char_ids = self.words2charindices(sents)
        sents_t = pad_sents_char(char_ids, self.char2id['<pad>'])
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
        sents_var = sents_var.permute(1, 0, 2).contiguous()

        return sents_var
Code Example #4
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO:
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in
        ###     previous parts

        sents_tmp = self.words2charindices(sents)
        sents_tmp_padded = pad_sents_char(sents_tmp, self.char2id['<pad>'])
        sents_var = torch.tensor(sents_tmp_padded,
                                 dtype=torch.long,
                                 device=device)
        # (batch_size, max_sentence_length, max_word_length) -> (max_sentence_length, batch_size, max_word_length)
        sents_var = sents_var.permute(1, 0, 2).contiguous()
        return sents_var
Code Example #5
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in
        ###     previous parts

        batch_size, max_sentence_length = len(sents), max(
            [len(s) for s in sents])
        word_ids = self.words2charindices(sents)
        word_ids_padded = pad_sents_char(
            word_ids, self.char2id['<pad>']
        )  # shape = (batch_size, max_sentence_length, max_word_length)
        max_word_length = len(word_ids_padded[0][0])
        assert_expected_size(
            torch.tensor(word_ids_padded, device=device), 'word_ids_padded',
            [batch_size, max_sentence_length, max_word_length])

        sents_var = torch.tensor(word_ids_padded,
                                 device=device).permute(1, 0, 2)
        assert_expected_size(
            sents_var, 'sents_var',
            [max_sentence_length, batch_size, max_word_length])
        return sents_var.contiguous()
Code Example #6
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO:
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in
        ###     previous parts
        char_ids = self.words2charindices(sents)
        sents_t = pad_sents_char(char_ids, self.char2id['<pad>'])
        # (batch_size, max_sentence_length, max_word_length)
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)

        return sents_var.permute(1, 0, 2).contiguous()
Code Example #7
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO:
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in
        ###     previous parts
        char_indices = self.words2charindices(sents)
        pad_token = self.char2id['<pad>']  # character-level pad index, not word2id
        padded_char_indices = pad_sents_char(char_indices, pad_token)
        sents_var = torch.tensor(padded_char_indices,
                                 dtype=torch.long,
                                 device=device)

        # (batch_size, max_sentence_length, max_word_length)
        # -> (max_sentence_length, batch_size, max_word_length)
        sents_var = sents_var.permute(1, 0, 2).contiguous()
        return sents_var
Code Example #8
File: vocab.py  Project: atlasbc/cs224n_2019
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO:
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in
        ###     previous parts

        # turn words into list of character indices
        sents = self.words2charindices(sents)

        #pad sentences and words
        sents_padded = pad_sents_char(sents, self.char2id['<pad>'])

        # pad_sents_char > (batch_size, max_sentence_length, max_word_length)
        # adjust dimensions
        sents_var = torch.tensor(sents_padded, dtype=torch.long, device=device)
        sents_var = sents_var.permute(1, 0, 2)
        return sents_var
Code Example #9
def question_1b_sanity_check():
    """ Sanity check for pad_sents_char() function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1b: Padding")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)

    padded_sentences = pad_sents_char(word_ids, 0)
    gold_padded_sentences = torch.load(
        './sanity_check_en_es_data/gold_padded_sentences.pkl')
    assert padded_sentences == gold_padded_sentences, "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(
        gold_padded_sentences, padded_sentences)
    print(
        len(padded_sentences),
        torch.transpose(torch.tensor(padded_sentences, dtype=torch.int), 0,
                        1).shape)

    print("Sanity Check Passed for Question 1b: Padding!")
    print("-" * 80)
Code Example #10
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO:
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in
        ###     previous parts

        word_ids = self.words2charindices(sents)
        sents_padded = pad_sents_char(word_ids, self.char2id['<pad>'])

        # this is of shape (batch_size, max_sentence_length, max_word_length)
        sents_padded_tensor = torch.tensor(sents_padded,
                                           dtype=torch.long,
                                           device=device)

        # need to make it (max_sentence_length, batch_size, max_word_length)
        # adding .contiguous() because otherwise the code was failing in nmt_model.py around line 128

        sents_var = sents_padded_tensor.permute(1, 0, 2).contiguous()

        return sents_var
Code Example #11
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)

        """

        # This is to convert each character in all words to its corresponding index in the character-vocabulary
        # The inner lists are neither uniform in length nor padded yet
        sents = self.words2charindices(sents)

        # Apply pad_sents_char with the pad id
        # This function pads all words and sentences to their maximum lengths
        sents = pad_sents_char(sents, self.char2id['<pad>'])

        # Convert the padded result to a tensor on the requested device
        res_sents = torch.tensor(sents, dtype=torch.long, device=device)

        # Swap the first and second dimensions:
        # (batch_size, max_sentence_length, max_word_length) -> (max_sentence_length, batch_size, max_word_length)
        res_sents = torch.transpose(res_sents, 0, 1)
        return res_sents
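For the three-dimensional tensor used here, torch.transpose(x, 0, 1) and x.permute(1, 0, 2) produce the same result, so the examples on this page that differ only in that call are interchangeable. A quick illustrative check (not from the original project):

import torch

x = torch.arange(24).reshape(2, 3, 4)
assert torch.equal(torch.transpose(x, 0, 1), x.permute(1, 0, 2))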
Code Example #12
File: sanity_check.py  Project: arkhalid/XCS224N-A5
def question_1c_sanity_check():
    """ Sanity check for to_input_tensor_char() function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1c: To input tensor")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)

    padded_sentences = pad_sents_char(word_ids, 0)
    gold_padded_sentences = torch.load('./sanity_check_en_es_data/gold_padded_sentences.pkl')
    assert padded_sentences == gold_padded_sentences, "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(
        gold_padded_sentences, padded_sentences)

    batch_size = len(gold_padded_sentences)
    max_sentence_length = len(gold_padded_sentences[0])
    max_word_length = len(gold_padded_sentences[0][0])

    padded_sentences_tensor = vocab.to_input_tensor_char(sentences, device=torch.device('cpu'))

    assert (padded_sentences_tensor.size() == (max_sentence_length, batch_size, max_word_length))

    print("Sanity Check Passed for Question 1c: To input tensor")
    print("-" * 80)
Code Example #13
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1c
        ### TODO:
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in
        ###     previous parts

        words_ids = self.words2charindices(sents)
        padded_sents_chars = pad_sents_char(
            words_ids, char_pad_token=self.char2id['<pad>'])
        sents_var = torch.tensor(padded_sents_chars,
                                 dtype=torch.long,
                                 device=device)

        sents_var_tr = torch.transpose(
            sents_var, 0, 1
        )  # Ensure you reshape the dimensions so that the output has shape: (max sentence length, batch size, max word length)

        return sents_var_tr
Code Example #14
    def to_input_tensor_char(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO: 
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in 
        ###     previous parts

        # Convert each character to its integer index: word_ids (list[list[list[int]]])
        word_ids = self.words2charindices(sents)
        # Pad to the required lengths: sents_padded (list[list[list[int]]])
        #                                   => (batch_size, max_sentence_length, max_word_length)
        sents_padded = pad_sents_char(word_ids, self.char2id['<pad>'])

        sents_padded_tensor = torch.tensor(sents_padded, dtype=torch.long, device=device)

        sents_var = torch.transpose(sents_padded_tensor, 0, 1)

        return sents_var
Code Example #15
File: vocab.py  Project: liushui9404/gap-coreference
    def to_input_tensor_char(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO: 
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in 
        ###     previous parts
        words2char_sents = self.words2charindices(sents)
        # use the character-level pad index (char2id), not the word-level one
        padded_sents = pad_sents_char(words2char_sents, self.char2id['<pad>'])
        sents_var = torch.tensor(padded_sents, dtype=torch.long, device=device)
        sents_var = sents_var.permute(1, 0, 2).contiguous()
        return sents_var
Code Example #16
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO:
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in
        ###     previous parts
        word_ids = self.words2charindices(sents)

        # of shape (batch_size, max_sentence_length, max_word_length), placed on the specified device (cpu or gpu)
        sents_var = torch.tensor(pad_sents_char(word_ids,
                                                self.char2id['<pad>']),
                                 dtype=torch.long,
                                 device=device)

        # switch to (max_sentence_length, batch_size, max_word_length)
        sents_var = sents_var.permute(1, 0, 2)

        ### END YOUR CODE
        return sents_var
Code Example #17
File: vocab.py  Project: eric07109/xcs224n_a5
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1c
        ### TODO:
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in
        ###     previous parts

        sents2int = self.words2charindices(sents)
        sent_padded = torch.tensor(pad_sents_char(sents2int,
                                                  self.char2id['<pad>']),
                                   dtype=torch.long,
                                   device=device)
        # permute (not view) so the batch and sentence dimensions are actually swapped
        sent_padded = sent_padded.permute(1, 0, 2).contiguous()
        return sent_padded
Code Example #18
def question_1f_sanity_check():
    """ Sanity check for pad_sents_char() function. 
    """
    print ("-"*80)
    print("Running Sanity Check for Question 1f: Padding")
    print ("-"*80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)

    padded_sentences = pad_sents_char(word_ids, 0)
    gold_padded_sentences = torch.load('./sanity_check_en_es_data/gold_padded_sentences.pkl')
    assert padded_sentences == gold_padded_sentences, "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(gold_padded_sentences, padded_sentences)

    test_list = [[[4]*33]]
    padded_sent = pad_sents_char(test_list, 0)
    assert len(padded_sent[0][0]) == 21
    print("Sanity Check Passed for Question 1f: Padding!")
    print("-"*80)
Code Example #19
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        sents = self.words2charindices(sents)
        sents = pad_sents_char(sents, self.char2id['<pad>'])
        return torch.tensor(sents).permute(1, 0, 2).to(device)
Code Example #20
File: vocab.py  Project: t6am3/cs224n-2019
    def to_input_tensor_char(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1f
        ### TODO: 
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in 
        ###     previous parts
        return torch.tensor(pad_sents_char(self.words2charindices(sents), self.char2id['<pad>'])).permute(1,0,2).to(device)
Code Example #21
def question_1c_sanity_check():
    print("-" * 80)
    print("Running Sanity Check for Question 1c: Input tensor")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)

    padded_sentences = pad_sents_char(word_ids, 0)
    o_tnsr = vocab.to_input_tensor_char(sentences, torch.device("cpu"))
    print(o_tnsr.shape)
Code Example #22
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        char_indices = self.words2charindices(sents)
        padded_chars = pad_sents_char(char_indices, self.char2id['<pad>'])
        sents_var = torch.tensor(padded_chars, dtype=torch.long, device=device)
        sents_var = sents_var.permute(1, 0, 2).contiguous()
        return sents_var
Code Example #23
    def test_question_1f_sanity_check(self):
        """ Sanity check for pad_sents_char() function.
        """
        vocab = VocabEntry()

        print("Running test on a list of sentences")
        sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
        word_ids = vocab.words2charindices(sentences)

        padded_sentences = pad_sents_char(word_ids, 0)
        gold_padded_sentences = torch.load('./sanity_check_en_es_data/gold_padded_sentences.pkl')
        assert len(gold_padded_sentences) == len(padded_sentences)
        for expected, got in zip(gold_padded_sentences, padded_sentences):
            if got != expected:
                raise AssertionError('got {}: expected: {}'.format(got, expected))
        assert padded_sentences == gold_padded_sentences, "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(gold_padded_sentences, padded_sentences)
Code Example #24
File: vocab.py  Project: alekjedrosz/pl-en-nmt
    def to_input_tensor_char(self, sents: List[List[str]],
                             device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        :param List[List[str]] sents: list of sentences comprising of words.
        :param device: device on which to load the tensor, i.e. CPU or GPU.

        :return sents_var: padded tensor of sentences at the character level,
        shaped (max_sentence_length, batch_size, max_word_length).
        """
        char_indices = self.words2charindices(sents)
        sents_var = pad_sents_char(char_indices, self.char2id['<pad>'])
        sents_var = torch.tensor(sents_var, device=device)
        sents_var = sents_var.permute(1, 0, 2)

        return sents_var
Code Example #25
    def to_input_tensor(self, sents: List[List[str]],
                        device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        indices = pad_sents_char(self.words2charindices(sents),
                                 self.char2id['<pad>'])
        sents_var = torch.LongTensor(indices).permute(1, 0, 2).to(device)
        return sents_var
Code Example #26
    def to_input_tensor_char(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO: 
        char_ids = self.words2charindices(sents)
        sents_t = pad_sents_char(char_ids, self.char2id['<pad>'])
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
        # transpose the first two dimensions:
        # (batch_size, max_sentence_length, max_word_length) -> (max_sentence_length, batch_size, max_word_length)
        return torch.transpose(sents_var, 0, 1)
Code Example #27
File: vocab.py  Project: rbolline/Stanford-Assigments
    def to_input_tensor_char(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1c
        ### TODO: 
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in 
        ###     previous parts
        sents_t = pad_sents_char(self.words2charindices(sents),
                                 self.char2id['<pad>'])
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
        # permute (not reshape) to reorder the batch and sentence dimensions
        return sents_var.permute(1, 0, 2)
Code Example #28
File: vocab.py  Project: schlicht/stanford-nlp
    def to_input_tensor_char(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO:
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in
        ###     previous parts
        wordIndices = self.words2charindices(sents)
        wordIndicesPadded = pad_sents_char(wordIndices, char_pad_token=self.char2id['<pad>'])
        sents_var = torch.tensor(wordIndicesPadded, device=device)
        return sents_var.transpose(0, 1)
Code Example #29
    def to_input_tensor_char(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1c
        ### TODO: 
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in 
        ###     previous parts
        sents_t = pad_sents_char(self.words2charindices(sents), self.char2id['<pad>'])
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
        return sents_var.permute(1, 0, 2)
Code Example #30
File: vocab.py  Project: MichalPitr/CS224n-2019
    def to_input_tensor_char(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size, max_word_length)
        """
        ### YOUR CODE HERE for part 1g
        ### TODO: 
        ###     Connect `words2charindices()` and `pad_sents_char()` which you've defined in 
        ###     previous parts
        char_ids = self.words2charindices(sents)  # list[list[list[int]]]
        char_ids_padded = pad_sents_char(char_ids, char_pad_token=self.char2id['<pad>'])
        # (batch_size, max_sentence_length, max_word_length); keep dtype=torch.long,
        # since embedding lookups require int64 indices
        sents = torch.tensor(char_ids_padded, dtype=torch.long, device=device)
        sents_var = sents.permute(1, 0, 2)
        return sents_var