Example #1
def question_1g_sanity_check():
    """ Sanity check for to input tensor char() function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1g: Creating Input Tensor")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    device = torch.device("cpu")
    max_sentence_length = max([len(sent) for sent in sentences])
    max_word_length = 21

    input_tensor = vocab.to_input_tensor_char(sentences, device)
    print(input_tensor)
    batch_size = len(sentences)
    correct_shape = [max_sentence_length, batch_size, max_word_length]
    actual_shape = list(input_tensor.size())
    assert actual_shape == correct_shape, "Input Tensor Creation is incorrect: it should be \n{} but is:{}".format(
        correct_shape, input_tensor.size())
    print("Sanity Check Passed for Question 1g: Creating Input Tensor!")
    print("-" * 80)
Example #2
def question_1e_sanity_check():
    """ Sanity check for to_input_tensor_char() function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1e: To Input Tensor Char")
    print("-" * 80)
    vocabEntry = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human', ':', 'What', 'do', 'we', 'want', '?'],
                 ['Computer', ':', 'Natural', 'language', 'processing', '!'],
                 ['Human', ':', 'When', 'do', 'we', 'want', 'it', '?'],
                 ['Computer', ':', 'When', 'do', 'we', 'want', 'what', '?']]
    sentence_length = 8
    BATCH_SIZE = 4
    word_length = 12
    output = vocabEntry.to_input_tensor_char(sentences, 'cpu')
    output_expected_size = [sentence_length, BATCH_SIZE, word_length]
    assert list(
        output.size()
    ) == output_expected_size, "output shape is incorrect: it should be:\n {} but is:\n{}".format(
        output_expected_size, list(output.size()))

    print("Sanity Check Passed for Question 1e: To Input Tensor Char!")
    print("-" * 80)
Example #3
def question_1g_sanity_check(model):
    """ Sanity check for pad_sents_char() function. 
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1g: Padding")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)

    #padded_sentences = pad_sents_char(word_ids, 0)
    padded_sentences = vocab.to_input_tensor_char(sentences, model.device)
    gold_padded_sentences = torch.load(
        './sanity_check_en_es_data/gold_padded_sentences.pkl')

    a = torch.rand(6, 4, 21)
    print(a.size())
    print(padded_sentences.size())
    assert padded_sentences.size() == a.size(
    ), "to_input_tensor_char size is incorrect: it should be:\n {} but is:\n{}".format(
        a.size(), padded_sentences.size())

    print("Sanity Check Passed for Question 1g: Padding!")
    print("-" * 80)
Example #4
def question_1e_sanity_check():
    """Sanity check for to_input_tensor_char() function."""
    print("-" * 80)
    print("Running Sanity Check for Question 1e: To Input Tensor Char")
    print("-" * 80)
    vocabEntry = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [
        ["Human", ":", "What", "do", "we", "want", "?"],
        ["Computer", ":", "Natural", "language", "processing", "!"],
        ["Human", ":", "When", "do", "we", "want", "it", "?"],
        ["Computer", ":", "When", "do", "we", "want", "what", "?"],
    ]
    sentence_length = 8
    BATCH_SIZE = 4
    word_length = 12
    output = vocabEntry.to_input_tensor_char(sentences, "cpu")
    output_expected_size = [sentence_length, BATCH_SIZE, word_length]
    assert (
        list(output.size()) == output_expected_size
    ), "output shape is incorrect: it should be:\n {} but is:\n{}".format(
        output_expected_size, list(output.size()))

    print("Sanity Check Passed for Question 1e: To Input Tensor Char!")
    print("-" * 80)
Example #5
def question_1a_sanity_check():
    """ Sanity check for words2charindices function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1a: words2charindices()")
    print("-" * 80)
    vocab = VocabEntry()

    print('Running test on small list of sentences')
    sentences = [["a", "b", "c?"], ["~d~", "c", "b", "a"]]
    small_ind = vocab.words2charindices(sentences)
    small_ind_gold = [[[1, 30, 2], [1, 31, 2], [1, 32, 70, 2]],
                      [[1, 85, 33, 85, 2], [1, 32, 2], [1, 31, 2], [1, 30, 2]]]
    assert (small_ind == small_ind_gold), \
        "small test resulted in indices list {:}, expected {:}".format(small_ind, small_ind_gold)

    print('Running test on large list of sentences')
    tgt_sents = [
        ['<s>', "Let's", 'start', 'by', 'thinking', 'about', 'the', 'member', 'countries', 'of', 'the', 'OECD,', 'or',
         'the', 'Organization', 'of', 'Economic', 'Cooperation', 'and', 'Development.', '</s>'],
        ['<s>', 'In', 'the', 'case', 'of', 'gun', 'control,', 'we', 'really', 'underestimated', 'our', 'opponents.',
         '</s>'],
        ['<s>', 'Let', 'me', 'share', 'with', 'those', 'of', 'you', 'here', 'in', 'the', 'first', 'row.', '</s>'],
        ['<s>', 'It', 'suggests', 'that', 'we', 'care', 'about', 'the', 'fight,', 'about', 'the', 'challenge.', '</s>'],
        ['<s>', 'A', 'lot', 'of', 'numbers', 'there.', 'A', 'lot', 'of', 'numbers.', '</s>']]
    tgt_ind = vocab.words2charindices(tgt_sents)
    tgt_ind_gold = pickle.load(open('./sanity_check_en_es_data/1a_tgt.pkl', 'rb'))
    assert (tgt_ind == tgt_ind_gold), "target vocab test resulted in indices list {:}, expected {:}".format(tgt_ind,
                                                                                                            tgt_ind_gold)

    print("All Sanity Checks Passed for Question 1a: words2charindices()!")
    print("-" * 80)
Example #6
def question_1g_sanity_check():
    """ Sanity check for to_input_tensor_char() function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1g: to_input_tensor_char")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]

    X = vocab.to_input_tensor_char(sentences, "cpu")

    # 6 is the max_sentence_length
    # 4 is batch size
    # 21 is max_word_length
    assert X.shape == (
        6, 4,
        21), f"Size is incorrect: it should be (6, 4, 21) but it is {X.shape}"

    print("Sanity Check Passed for Question 1g: to_input_tensor_char!")
    print("-" * 80)
Example #7
def question_1b_sanity_check():
    """ Sanity check for pad_sents_char() function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1b: Padding")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)

    padded_sentences = pad_sents_char(word_ids, 0)
    gold_padded_sentences = torch.load(
        './sanity_check_en_es_data/gold_padded_sentences.pkl')
    assert padded_sentences == gold_padded_sentences, "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(
        gold_padded_sentences, padded_sentences)
    print(
        len(padded_sentences),
        torch.transpose(torch.tensor(padded_sentences, dtype=torch.int), 0,
                        1).shape)

    print("Sanity Check Passed for Question 1b: Padding!")
    print("-" * 80)
Example #8
def question_1g_test():
    """ Custom simple test for to_input_tensor_char() function. 
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1g: Padding")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    gold_shape = torch.Size(
        [6, 4, 21])  # (max sentence length, batch size, max word length)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    input_tensor = vocab.to_input_tensor_char(sentences, device)

    # print("We get torch tensor:\n", input_tensor)
    assert input_tensor.shape == gold_shape, "Output tensor shape is incorrect: it should be:\n {} but is:\n{}".format(
        gold_shape, input_tensor.shape)

    print("Sanity Check Passed for Question 1g: Padding!")
    print("-" * 80)
Example #9
def load_word2vec(fpath: str, vocab: VocabEntry,
                  device: torch.device) -> torch.Tensor:
    """load pretrained embedding vectors for words in vocab.
    :param fpath : word2vec file(from fasttext) path, in which already contains </s> token.
    :param vocab : constructed vocabulary
    :return word2vec (vocab_size, embed_size): tensor of word2vec
    """
    print("loading pretrained word2vec from %s......" % fpath)
    model = KeyedVectors.load_word2vec_format(fpath, limit=int(1e5))
    words = vocab.get_words()
    word2vec = []
    for w in tqdm(words, desc='loading'):
        try:
            word2vec.append(model[w].astype(np.float64))
        except KeyError:
            if w == vocab.get_pad_info(0):
                # initialize pad token with zero vector
                word2vec.append(np.zeros(model.vector_size, dtype=np.float64))
            else:
                uniform_init = 0.1
                word2vec.append(
                    np.random.uniform(low=-uniform_init,
                                      high=uniform_init,
                                      size=model.vector_size).astype(np.float64))
    word2vec = np.stack(word2vec, axis=0)
    word2vec = torch.from_numpy(word2vec).to(torch.float).to(device)
    assert word2vec.size(0) == len(
        vocab
    ), "tensor size wrong, first dimention should be equal to vocab size"
    return word2vec
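A hypothetical usage of load_word2vec, seeding an nn.Embedding from the returned tensor (the file path is illustrative, not from the original):

vocab = VocabEntry()
device = torch.device('cpu')
vectors = load_word2vec('cc.en.300.vec', vocab, device)  # illustrative path
# freeze=False keeps the pretrained vectors trainable.
embedding = torch.nn.Embedding.from_pretrained(vectors, freeze=False)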
Example #10
def question_1h_sanity_check(model):
    """ Sanity check for highway network 
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1h: Padding")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)

    #padded_sentences = pad_sents_char(word_ids, 0)
    padded_sentences = vocab.to_input_tensor_char(sentences, model.device)
    gold_padded_sentences = torch.load(
        './sanity_check_en_es_data/gold_padded_sentences.pkl')

    # Test with batch size 1
    x = torch.rand(1, 1, 21)
    hw = Highway(21, 21, 21, 0.5)
    hw.forward(x)
    # Test with batch size 4 (same per-example shape, larger leading batch dim)
    x = torch.rand(4, 1, 21)
    hw.forward(x)

    a = torch.rand(6, 4, 21)  # expected to_input_tensor_char shape, as in Example #3
    print(a.size())
    print(padded_sentences.size())
    #assert padded_sentences.size() == a.size(), "to_input_tensor_char size is incorrect: it should be:\n {} but is:\n{}".format(a.size(), padded_sentences.size())

    print("Sanity Check Passed for Question 1h: Padding!")
    print("-" * 80)
Example #11
def question_1c_sanity_check():
    """ Sanity check for to_input_tensor_char() function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1c: To input tensor")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)

    padded_sentences = pad_sents_char(word_ids, 0)
    gold_padded_sentences = torch.load('./sanity_check_en_es_data/gold_padded_sentences.pkl')
    assert padded_sentences == gold_padded_sentences, "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(
        gold_padded_sentences, padded_sentences)

    batch_size = len(gold_padded_sentences)
    max_sentence_length = len(gold_padded_sentences[0])
    max_word_length = len(gold_padded_sentences[0][0])

    padded_sentences_tensor = vocab.to_input_tensor_char(sentences, device=torch.device('cpu'))

    assert (padded_sentences_tensor.size() == (max_sentence_length, batch_size, max_word_length))

    print("Sanity Check Passed for Question 1c: To input tensor")
    print("-" * 80)
Example #12
def test_file1_method1():
    batch_size = 2
    max_sent_len = 3
    max_word_length = 21
    sentence = [['ciao', 'come', 'staiii'], ['sto', 'bene']]
    v = VocabEntry()
    tens = v.to_input_tensor_char(sentence, torch.device('cpu'))
    assert tens.shape[0] == max_sent_len
    assert tens.shape[1] == batch_size
    assert tens.shape[2] == max_word_length
Example #13
    def get_code_change_tensors(self, code_vocab: VocabEntry,
                                action_vocab: VocabEntry,
                                device: torch.device):
        code_tensor_a = code_vocab.to_input_tensor(self.old_code_tokens,
                                                   device)
        code_tensor_b = code_vocab.to_input_tensor(self.new_code_tokens,
                                                   device)

        edit_tensor = action_vocab.to_input_tensor(self.edit_actions, device)

        return code_tensor_a, code_tensor_b, edit_tensor
Example #14
def question_1g_sanity_check():
    """ Sanity check for to_input() function.
    """
    print ("-"*80)
    print("Running Sanity Check for Question 1g: Reshape")
    print ("-"*80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    sent_tensor = vocab.to_input_tensor_char(sentences, "cpu")

    print("Sanity Check Passed for Question 1g: Reshape!")
    print("-"*80)
Example #15
def question_1c_sanity_check():
    """ Sanity check for to_input_tensor_char() function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1c")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    sent_padded = vocab.to_input_tensor_char(sentences, torch.device('cpu'))
Example #16
def question_1g_sanity_check():
    """ Sanity check for to_input_tensor_char() function. 
    """
    print ("-"*80)
    print("Running Sanity Check for Question 1g: Padding")
    print ("-"*80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    input_tensor = vocab.to_input_tensor_char(sentences, torch.device('cuda', 0))
    # print(input_tensor.shape)
    assert input_tensor.shape == (6, 4, 21)
    print("Sanity Check Passed for Question 1g: Padding!")
    print("-"*80)
Example #17
def question_1c_sanity_check():
    print("-" * 80)
    print("Running Sanity Check for Question 1c: Input tensor")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)

    padded_sentences = pad_sents_char(word_ids, 0)
    o_tnsr = vocab.to_input_tensor_char(sentences, "cpu")
    print(o_tnsr.shape)
Example #18
def question_1g_sanity_check():
    """ Sanity check for pad_sents_char() function.
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1g: question_1g_sanity_check")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'],
                 ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'],
                 ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    t = vocab.to_input_tensor_char(sentences, torch.device('cpu', 0))
    print("Sanity Check Passed for Question 1g:shape=" + str(t.shape))
    print("-" * 80)
Example #19
    def test_question_1f_sanity_check(self):
        """ Sanity check for pad_sents_char() function.
        """
        vocab = VocabEntry()

        print("Running test on a list of sentences")
        sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
        word_ids = vocab.words2charindices(sentences)

        padded_sentences = pad_sents_char(word_ids, 0)
        gold_padded_sentences = torch.load('./sanity_check_en_es_data/gold_padded_sentences.pkl')
        assert len(gold_padded_sentences) == len(padded_sentences)
        for expected, got in zip(gold_padded_sentences, padded_sentences):
            if got != expected:
                raise AssertionError('got {}: expected: {}'.format(got, expected))
        assert padded_sentences == gold_padded_sentences, "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(gold_padded_sentences, padded_sentences)
Example #20
def sentence_ids_to_multi_ones_hot_vector(
        y: List[str], dictionary: VocabEntry) -> np.ndarray:
    total_length = len(dictionary)
    ones_hot = np.zeros(total_length, dtype=np.int64)
    hot_indices = dictionary.words2indices(y)
    ones_hot[hot_indices] = 1
    # drop the first four entries: '<pad>', '<s>', '</s>', '<unk>'
    return ones_hot[4:]
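A hypothetical usage, assuming vocab is a populated VocabEntry whose first four entries are '<pad>', '<s>', '</s>' and '<unk>' as the comment above implies:

labels = sentence_ids_to_multi_ones_hot_vector(['we', 'want', 'language'], vocab)
# One entry per non-special vocab word; 1 where the word occurs in y.
assert labels.shape == (len(vocab) - 4,)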
Example #21
def question_1g_sanity_check():
    """ Sanity check for to_input_tensor_char() function. 
    """
    print ("-"*80)
    print("Running Sanity Check for Question 1g: Building the input tensor")
    print ("-"*80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    device = torch.device('cpu')
    padded_tensor = vocab.to_input_tensor_char(sentences, device)
    gold_padded_sentences = torch.load('./sanity_check_en_es_data/gold_padded_sentences.pkl')
    gold_padded_tensor = torch.tensor(gold_padded_sentences, device = device).permute(1, 0, 2)
    assert padded_tensor.size() == gold_padded_tensor.size(), "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(gold_padded_tensor.size(), padded_tensor.size())

    print("Sanity Check Passed for Question 1g: Building the input tensor!")
    print("-"*80)
Example #22
    def get_train_and_dev(self, train_file_path, grammar_file,
                          primitive_types):
        src_freq = 3
        code_freq = 3
        grammar = ASDLGrammar.grammar_from_text(
            open(grammar_file).read(), primitive_types)
        transition_system = TransitionSystem(grammar)
        train_examples = self.preprocess_dataset(train_file_path,
                                                 transition_system)

        full_train_examples = train_examples[:]
        np.random.shuffle(train_examples)
        dev_examples = train_examples[:200]
        train_examples = train_examples[200:]

        src_vocab = VocabEntry.from_corpus(
            [e.sentence for e in train_examples],
            size=5000,
            freq_cutoff=src_freq)
        primitive_tokens = [
            map(
                lambda a: a.action.token,
                filter(lambda a: isinstance(a.action, GenTokenAction),
                       e.tgt_actions)) for e in train_examples
        ]
        primitive_vocab = VocabEntry.from_corpus(primitive_tokens,
                                                 size=5000,
                                                 freq_cutoff=code_freq)

        # generate vocabulary for the code tokens!
        code_tokens = [
            transition_system.tokenize_code(e.code, mode='decoder')
            for e in train_examples
        ]
        code_vocab = VocabEntry.from_corpus(code_tokens,
                                            size=5000,
                                            freq_cutoff=code_freq)

        vocab = Vocab(source=src_vocab,
                      primitive=primitive_vocab,
                      code=code_vocab)

        return train_examples, dev_examples, vocab
Example #23
def question_1f_sanity_check():
    """ Sanity check for pad_sents_char() function. 
    """
    print ("-"*80)
    print("Running Sanity Check for Question 1f: Padding")
    print ("-"*80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'], ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)

    padded_sentences = pad_sents_char(word_ids, 0)
    gold_padded_sentences = torch.load('./sanity_check_en_es_data/gold_padded_sentences.pkl')
    assert padded_sentences == gold_padded_sentences, "Sentence padding is incorrect: it should be:\n {} but is:\n{}".format(gold_padded_sentences, padded_sentences)

    test_list = [[[4]*33]]
    padded_sent = pad_sents_char(test_list, 0)
    assert len(padded_sent[0][0]) == 21
    print("Sanity Check Passed for Question 1f: Padding!")
    print("-"*80)
Example #24
    def _from_json(self):
        vocab_file = "qna_data/{}_vocab.json".format(self.method)
        dataset_file = "qna_data/{}_dataset.json".format(self.method)

        self.vocab = VocabEntry.from_json(vocab_file)

        dataset_json = read_json_data(dataset_file)

        for key in self.train_keys:
            setattr(self, key, dataset_json[key])

        self._to_numpy()
Example #25
    def load_vocab(self):
        # Load the vocabulary, or create it if the vocab file does not exist
        if self.args.vocab is not None:
            if not os.path.isfile(self.args.vocab):
                print('create new vocab and save to %s' % self.args.vocab)
                corpus = []
                for story in self.trn[0]:
                    for sent in story:
                        corpus.append(sent)
                if self.args.rebuild_vocab:
                    self.vocab = VocabEntry.from_corpus(
                        corpus,
                        50000,
                        remove_singleton=not self.args.include_singleton)
                else:
                    self.vocab = VocabEntry.from_dict(self.w2id)
                torch.save(self.vocab, self.args.vocab)
            else:
                self.vocab = torch.load(self.args.vocab)
        else:
            print('vocab file is required')
            exit(0)
Example #26
def question_1f_sanity_check():
    print("running sanity check for 1f:cnn")
    vocabEntry = VocabEntry()
    sentences = [['Human', ':', 'What', 'do', 'we', 'want', '?'],
                 ['Computer', ':', 'Natural', 'language', 'processing', '!'],
                 ['Human', ':', 'When', 'do', 'we', 'want', 'it', '?'],
                 ['Computer', ':', 'When', 'do', 'we', 'want', 'what', '?']]
    sentence_length = 8
    BATCH_SIZE = 4
    word_length = 12
    x_reshape = vocabEntry.to_input_tensor_char(sentences, 'cpu')
    cnn = CNN(k=5, f=2, emb_size=word_length, m_word=sentence_length)
    conv_out = cnn(x_reshape)
    print(conv_out.size())
Example #27
def question_1i_sanity_check():
    """ Sanity check for nmt_model.py
        basic shape check
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1i: NMT")
    print("-" * 80)
    src_vocab_entry = VocabEntry()
    tgt_vocab_entry = VocabEntry()
    dummy_vocab = Vocab(src_vocab_entry, tgt_vocab_entry)
    word_embed_size = 5
    hidden_size = 10

    nmt = NMT(word_embed_size, hidden_size, dummy_vocab)
    source = [["Hello my friend"], ["How are you"]]
    target = [["Bonjour mon ami"], ["Comment vas tu"]]
    output = nmt.forward(source, target)

    print(output)
    #output_expected_size = [sentence_length, BATCH_SIZE, EMBED_SIZE]
    #assert(list(output.size()) == output_expected_size), "output shape is incorrect: it should be:\n {} but is:\n{}".format(output_expected_size, list(output.size()))
    print("Sanity Check Passed for Question 1i: NMT!")
    print("-" * 80)
Example #28
    def train_forward(self, char_sequence, dec_hidden=None):
        """ Forward computation during training.

        @param char_sequence (Tensor): tensor of integers, shape (length, batch_size). Note that "length" here and in forward() need not be the same.
        @param dec_hidden (tuple(Tensor, Tensor)): initial internal state of the LSTM, obtained from the output of the word-level decoder. A tuple of two tensors of shape (1, batch_size, hidden_size)

        @returns The cross-entropy loss (Tensor), computed as the *sum* of cross-entropy losses of all the words in the batch.
        """
        ### YOUR CODE HERE for part 2b
        ### TODO - Implement training forward pass.
        ###
        ### Hint: - Make sure padding characters do not contribute to the cross-entropy loss. Check vocab.py to find the padding token's index.
        ###       - char_sequence corresponds to the sequence x_1 ... x_{n+1} (e.g., <START>,m,u,s,i,c,<END>). Read the handout about how to construct input and target sequence of CharDecoderLSTM.
        ###       - Carefully read the documentation for nn.CrossEntropyLoss and our handout to see what this criterion has already included:
        ###             https://pytorch.org/docs/stable/nn.html#crossentropyloss

        # char_sequence: [length, b] => delete end_token => input_sequence: [length, b]
        X_input = char_sequence[:-1]

        # char_sequence: [length, b] => delete start_token => target_sequence: [length, b]
        X_target = char_sequence[1:]

        # X_input: [length, b], dec_hidden = (h_n, c_n): ([1, b, h], [1, b, h])
        #    ==> softmax   ==>
        # s_t: [length, b, self.vocab_size], dec_hidden = (h_n, c_n): ([1, b, h], [1, b, h])
        s_t, dec_hidden = self.forward(X_input, dec_hidden)

        # Look up the char_pad index value (expected to be 0)
        vocab_entry = VocabEntry()
        idx_char_pad = vocab_entry.char_pad

        # Instantiate CrossEntropyLoss; it combines log-softmax and NLL loss
        compute_loss = nn.CrossEntropyLoss(ignore_index=idx_char_pad,
                                           reduction='sum')

        # Reshape s_t for compute_loss, length*b => b_char
        # length = length of a word, b = batch size, length*b = # of characters in the batch
        # s_t: [length, b, self.vocab_size] ==> s_t: [length*b, self.vocab_size] = [N, C]
        s_t = s_t.reshape(s_t.shape[0] * s_t.shape[1], -1)

        # Reshape X_target for compute_loss
        # X_target: [length, b] ==> X_target: [length*b] = [N]
        X_target = X_target.reshape(-1)

        # s_t: [length*b, self.vocab_size] = [N, C, d1...dk], X_target: [length*b] = [N]
        #   ==> compute_loss ==> loss_char_dec:
        loss_char_dec = compute_loss(s_t, X_target)

        return loss_char_dec
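To make the input/target split above concrete, a small worked example; the character indices are illustrative, following the a→30 mapping visible in Example #5:

import torch

# "music" as <START>,m,u,s,i,c,<END>; shape (length, batch) with batch = 1.
char_sequence = torch.tensor([[1], [42], [50], [48], [38], [32], [2]])
X_input = char_sequence[:-1]   # <START>, m, u, s, i, c
X_target = char_sequence[1:]   # m, u, s, i, c, <END>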
Example #29
def question_1g_sanity_check():
    """
    Sanity check for to_input_tensor_char() function
    :return:
    """
    print("-" * 80)
    print("Running Sanity Check for Question 1g")
    print("-" * 80)
    vocab = VocabEntry()

    print("Running test on a list of sentences")
    sentences = [['Human:', 'What', 'do', 'we', 'want?'], ['Computer:', 'Natural', 'language', 'processing!'],
                 ['Human:', 'When', 'do', 'we', 'want', 'it?'], ['Computer:', 'When', 'do', 'we', 'want', 'what?']]
    word_ids = vocab.words2charindices(sentences)
    padded_sentences = pad_sents_char(word_ids, 0)

    a = np.asarray(padded_sentences).transpose((1, 0, 2))
    a = torch.from_numpy(a)  # keep the integer dtype so equal() can match the long tensor b
    b = vocab.to_input_tensor_char(sentences, device="cpu")

    assert a.equal(b), "Wrong!"

    print("Sanity Check Passed for Question 1g")
    print("-" * 80)
Example #30
    def test_question_1e_sanity_check(self):
        """ Sanity check for words2charindices function.
        """
        vocab = VocabEntry()


        sentences = [["a", "b", "c?"], ["~d~", "c", "b", "a"]]
        small_ind = vocab.words2charindices(sentences)
        small_ind_gold = [[[1, 30, 2], [1, 31, 2], [1, 32, 70, 2]], [[1, 85, 33, 85, 2], [1, 32, 2], [1, 31, 2], [1, 30, 2]]]
        assert(small_ind == small_ind_gold), \
            "small test resulted in indices list {:}, expected {:}".format(small_ind, small_ind_gold)

        # print('Running test on single sentence')
        # sentence = ["right", "arcs", "only"]
        # single_ind = vocab.words2charindices(sentence)
        # single_ind_gold = [[[1, 47, 2], [1, 38, 2], [1, 36, 2], [1, 37, 2], [1, 49, 2]], [[1, 30, 2], [1, 47, 2], [1, 32, 2], [1, 48, 2]], [[1, 44, 2], [1, 43, 2], [1, 41, 2], [1, 54, 2]]]
        # assert(single_ind == single_ind_gold), \
        #     "single sentence test resulted in indices list {:}, expected {:}".format(single_ind, single_ind_gold)

        print('Running test on large list of sentences')
        tgt_sents = [['<s>', "Let's", 'start', 'by', 'thinking', 'about', 'the', 'member', 'countries', 'of', 'the', 'OECD,', 'or', 'the', 'Organization', 'of', 'Economic', 'Cooperation', 'and', 'Development.', '</s>'], ['<s>', 'In', 'the', 'case', 'of', 'gun', 'control,', 'we', 'really', 'underestimated', 'our', 'opponents.', '</s>'], ['<s>', 'Let', 'me', 'share', 'with', 'those', 'of', 'you', 'here', 'in', 'the', 'first', 'row.', '</s>'], ['<s>', 'It', 'suggests', 'that', 'we', 'care', 'about', 'the', 'fight,', 'about', 'the', 'challenge.', '</s>'], ['<s>', 'A', 'lot', 'of', 'numbers', 'there.', 'A', 'lot', 'of', 'numbers.', '</s>']]
        tgt_ind = vocab.words2charindices(tgt_sents)
        tgt_ind_gold = pickle.load(open('./sanity_check_en_es_data/1e_tgt.pkl', 'rb'))
        assert(tgt_ind == tgt_ind_gold), "target vocab test resulted in indices list {:}, expected {:}".format(tgt_ind, tgt_ind_gold)