import io
from argparse import Namespace
from typing import Any, AnyStr, List, Tuple

import numpy as np
import torch
from datasets import load_dataset

# Project-local names used below (NLPVocabulary, TransformerDataset,
# TransformerBatch, AbstractNLPDataset, tokenize_corpus_basic, transformer,
# transformer_regression_test_data, utils) are assumed importable from the
# package under test; their module paths are not shown in these snippets.


def test_transformer_batch_tgt_masking():
    src_tokens = [["the", "cow", "jumped", "over", "the", "moon"],
                  ["the", "british", "are", "coming"]]
    tgt_tokens = [["la", "vache", "a", "sauté", "sur", "la", "lune"],
                  ["les", "britanniques", "arrivent"]]
    batch_size = len(src_tokens)
    dictionary_source = NLPVocabulary.build_vocabulary(src_tokens)
    dictionary_target = NLPVocabulary.build_vocabulary(tgt_tokens)
    max_seq_length = 10
    src_padded = TransformerDataset.padded_string_to_integer(
        src_tokens, max_seq_length, dictionary_source)
    tgt_padded = TransformerDataset.padded_string_to_integer(
        tgt_tokens, max_seq_length + 1, dictionary_target)
    batched_object_data = TransformerBatch(torch.LongTensor(src_padded),
                                           torch.LongTensor(tgt_padded))

    # test masking of target: blend of padding and auto-regressive masking

    # last row: every real target token (plus EOS) is visible; only padding is masked
    assert torch.equal(
        batched_object_data.tgt_mask[0, 9, :],
        torch.BoolTensor(
            [True, True, True, True, True, True, True, True, False, False]))
    # first row: only the very first position is visible (auto-regressive)
    assert torch.equal(
        batched_object_data.tgt_mask[0, 0, :],
        torch.BoolTensor([
            True, False, False, False, False, False, False, False, False, False
        ]))
def test_transformer_batch_dimensions():
    src_tokens = [["the", "cow", "jumped", "over", "the", "moon"],
                  ["the", "british", "are", "coming"]]
    tgt_tokens = [["la", "vache", "a", "sauté", "sur", "la", "lune"],
                  ["les", "britanniques", "arrivent"]]
    batch_size = len(src_tokens)
    dictionary_source = NLPVocabulary.build_vocabulary(src_tokens)
    dictionary_target = NLPVocabulary.build_vocabulary(tgt_tokens)
    max_seq_length = 20
    src_padded = TransformerDataset.padded_string_to_integer(
        src_tokens, max_seq_length, dictionary_source)
    tgt_padded = TransformerDataset.padded_string_to_integer(
        tgt_tokens, max_seq_length + 1, dictionary_target)
    batched_object_data = TransformerBatch(torch.LongTensor(src_padded),
                                           torch.LongTensor(tgt_padded))

    # test dimensions
    assert batched_object_data.src.size() == torch.Size(
        [batch_size, max_seq_length])
    assert batched_object_data.src_mask.size() == torch.Size(
        [batch_size, 1, max_seq_length])
    assert batched_object_data.tgt.size() == torch.Size(
        [batch_size, max_seq_length])
    assert batched_object_data.tgt_y.size() == torch.Size(
        [batch_size, max_seq_length])
    assert batched_object_data.tgt_mask.size() == torch.Size(
        [batch_size, max_seq_length, max_seq_length])
def test_transformer_batch_src_masking():
    src_tokens = [["the", "cow", "jumped", "over", "the", "moon"],
                  ["the", "british", "are", "coming"]]
    tgt_tokens = [["la", "vache", "a", "sauté", "sur", "la", "lune"],
                  ["les", "britanniques", "arrivent"]]
    batch_size = len(src_tokens)
    dictionary_source = NLPVocabulary.build_vocabulary(src_tokens)
    dictionary_target = NLPVocabulary.build_vocabulary(tgt_tokens)
    max_seq_length = 10
    src_padded = TransformerDataset.padded_string_to_integer(
        src_tokens, max_seq_length, dictionary_source)
    tgt_padded = TransformerDataset.padded_string_to_integer(
        tgt_tokens, max_seq_length + 1, dictionary_target)
    batched_object_data = TransformerBatch(torch.LongTensor(src_padded),
                                           torch.LongTensor(tgt_padded))

    # test source padding mask: True spans the real tokens plus the EOS token
    assert torch.equal(
        batched_object_data.src_mask[0, :],
        torch.BoolTensor(
            [[True, True, True, True, True, True, True, False, False, False]]))
    assert torch.equal(
        batched_object_data.src_mask[1, :],
        torch.BoolTensor(
            [[True, True, True, True, True, False, False, False, False,
              False]]))
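# TransformerBatch itself is not shown in these snippets. Below is a minimal
# sketch of the mask construction the tests above exercise, after the batch
# splits the padded target into tgt (inputs) and tgt_y (labels). The name
# make_masks is hypothetical, and it assumes pad index 0 plus the standard
# causal-mask recipe:
def make_masks(src: torch.Tensor, tgt: torch.Tensor, pad_idx: int = 0):
    # source mask hides padding only: shape (batch, 1, src_len)
    src_mask = (src != pad_idx).unsqueeze(-2)
    # target mask hides padding AND future positions: shape (batch, tgt_len, tgt_len)
    tgt_len = tgt.size(-1)
    causal = torch.tril(torch.ones(tgt_len, tgt_len, dtype=torch.bool))
    tgt_mask = (tgt != pad_idx).unsqueeze(-2) & causal
    return src_mask, tgt_mask
# Row i of tgt_mask is True only for non-pad positions up to and including i,
# which is exactly the blend of padding and auto-regressive masking asserted
# in test_transformer_batch_tgt_masking.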
    @staticmethod
    def padded_string_to_integer(token_list: List[List[str]],
                                 max_sequence_length: int,
                                 vocab: NLPVocabulary) -> List[List[int]]:
        """
        Take a sequence of (string) tokens and convert them to a padded set of integers.

        Args:
            token_list (List[List[str]]):
                List of tokens to be converted to indices.
            max_sequence_length (int):
                Maximum length of each source/target sequence.
            vocab (NLPVocabulary):
                Dictionary to look up indices for each token.
        Returns:
            Sequences of indices, padded with PAD and terminated with EOS where room allows.
        """

        integer_list = []

        for tokens in token_list:
            integers = [vocab.mask_index] * max_sequence_length
            # this allows for truncated sequences.
            # In some problems, we explicitly throw out
            # datapoints longer than max_sequence_length prior to this step.
            integers[:len(tokens)] = [vocab.lookup_token(x)
                                      for x in tokens][:len(integers)]
            # Adding in the EOS token if the sequence is not truncated.
            if len(tokens) < max_sequence_length:
                integers[len(tokens)] = vocab.eos_index
            integer_list.append(integers)

        return integer_list
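    # Usage sketch (exact indices depend on how NLPVocabulary assigns them;
    # the values below assume mask_index=0 and eos_index=2, matching the
    # conversion test further down):
    #
    #     vocab = NLPVocabulary.build_vocabulary([["hello", "world"]])
    #     # room to spare: tokens, then EOS, then PAD out to length 5
    #     TransformerDataset.padded_string_to_integer([["hello", "world"]], 5, vocab)
    #     # -> [[3, 4, 2, 0, 0]]
    #     # no room: the sequence is truncated and no EOS is appended
    #     TransformerDataset.padded_string_to_integer([["hello", "world", "hello"]], 2, vocab)
    #     # -> [[3, 4]]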
    @classmethod
    def get_training_data(cls, max_sequence_length: int) -> Tuple[AbstractNLPDataset, NLPVocabulary]:
        """
        Download training data from Hugging Face and put it into a normalized format.

        Args:
            max_sequence_length (int): The max sequence length.
        Returns:
            Tuple of the dataset and the pruned vocabulary.
        """
        # download the IMDB data from Hugging Face for sentiment analysis
        dataset = load_dataset("imdb")['train']
        # note: targets are {0,1} and the data is not shuffled
        train_target, train_text = list(dataset.data[0]), list(dataset.data[1])
        # convert datatypes to native python
        train_text = [str(x) for x in train_text]
        train_target = [x.as_py() for x in train_target]
        # tokenize the data using our tokenizer
        train_text = tokenize_corpus_basic(train_text, False)
        # throw out any data points longer than max_sequence_length
        # train_text = [x for x in train_text if len(x) <= max_sequence_length - 1]
        # build our vocab on the stripped text
        vocab = NLPVocabulary.build_vocabulary(train_text)
        # remove rare words so the dictionary stays well under 75k tokens
        vocab_small = cls.prune_vocab(vocab, 1.e-6)
        # convert into padded sequences of integers
        train_text = cls.padded_string_to_integer(train_text, max_sequence_length, vocab_small)

        return cls(list(zip(train_target, train_text)), vocab_small), vocab_small
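    # Usage sketch (the owning class is not shown in this snippet;
    # SentimentDataset is a hypothetical name):
    #
    #     dataset, vocab = SentimentDataset.get_training_data(max_sequence_length=128)
    #     # dataset yields (target, padded integer sequence) pairs; vocab is
    #     # the pruned dictionary used for the integer encoding.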
def test_padded_string_to_integer_conversion():
    token_list = [["the", "cow", "jumped", "over", "the", "moon"]]
    vocab = NLPVocabulary.build_vocabulary(token_list)
    max_seq_length = 10
    padded_integers = TransformerDataset.padded_string_to_integer(
        token_list, max_seq_length, vocab)
    assert padded_integers[0] == [3, 4, 5, 6, 3, 7, 2, 0, 0, 0]
    @staticmethod
    def build_vocab(filepath: AnyStr, tokenizer):
        """
        Static method that builds an NLPVocabulary object from the provided file.

        Args:
            filepath (AnyStr): Path of the file to read when building the vocab.
            tokenizer (function): Function that converts a list of strings into tokens.

        Returns:
            An NLPVocabulary object built from the file's contents.
        """
        vocab = NLPVocabulary()
        with io.open(filepath, encoding="utf8") as f:
            for string_ in f:
                vocab.add_many(tokenizer([string_])[0])
        return vocab
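    # Usage sketch (tokenize_corpus_basic appears elsewhere in these snippets
    # and is assumed to map a list of strings to a list of token lists):
    #
    #     vocab = NLPVocabulary.build_vocab("corpus.txt", tokenize_corpus_basic)
    #     idx = vocab.lookup_token("the")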
def test_transformer_regression_test():
    utils.set_seed_everywhere()

    test_2_args = Namespace(
        num_layers_per_stack=2,
        dim_model=512,
        dim_ffn=2048,
        num_heads=8,
        max_sequence_length=20,
        dropout=0.1,
    )

    # mock dataset
    src_tokens = [["the", "cow", "jumped", "over", "the", "moon"],
                  ["the", "british", "are", "coming"]]
    tgt_tokens = [["la", "vache", "a", "sauté", "sur", "la", "lune"],
                  ["les", "britanniques", "arrivent"]]
    batch_size = len(src_tokens)
    dictionary_source = NLPVocabulary.build_vocabulary(src_tokens)
    dictionary_target = NLPVocabulary.build_vocabulary(tgt_tokens)
    max_seq_length = 20
    src_padded = TransformerDataset.padded_string_to_integer(
        src_tokens, max_seq_length, dictionary_source)
    tgt_padded = TransformerDataset.padded_string_to_integer(
        tgt_tokens, max_seq_length + 1, dictionary_target)
    data = TransformerBatch(torch.LongTensor(src_padded),
                            torch.LongTensor(tgt_padded))

    model = transformer.Transformer(len(dictionary_source),
                                    len(dictionary_target),
                                    test_2_args.num_layers_per_stack,
                                    test_2_args.dim_model, test_2_args.dim_ffn,
                                    test_2_args.num_heads,
                                    test_2_args.max_sequence_length,
                                    test_2_args.dropout)
    # push through model
    y_hat = model(data)

    # expected output
    expected_output = transformer_regression_test_data.TRANSFORMER_REGRESSION_TEST_DATA

    # assert y_hat is within eps
    eps = 1.e-4
    assert np.allclose(y_hat.data.numpy(),
                       expected_output.data.numpy(),
                       atol=eps)
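# utils.set_seed_everywhere is not shown in these snippets; for a regression
# test like the one above, it plausibly pins every RNG the model touches.
# A minimal sketch of that idea (the body below is an assumption, not the
# project's actual implementation):
def set_seed_everywhere_sketch(seed: int = 0) -> None:
    import random
    random.seed(seed)        # Python's built-in RNG
    np.random.seed(seed)     # NumPy RNG
    torch.manual_seed(seed)  # torch CPU (and CUDA, if present) RNGs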
def test_transformer_dataset_returns_two_tensors():
    src_tokens = [["the", "cow", "jumped", "over", "the", "moon"],
                  ["the", "british", "are", "coming"]]
    tgt_tokens = [["la", "vache", "a", "sauté", "sur", "la", "lune"],
                  ["les", "britanniques", "arrivent"]]
    dictionary_source = NLPVocabulary.build_vocabulary(src_tokens)
    dictionary_target = NLPVocabulary.build_vocabulary(tgt_tokens)
    max_seq_length = 20
    src_padded = TransformerDataset.padded_string_to_integer(
        src_tokens, max_seq_length, dictionary_source)
    tgt_padded = TransformerDataset.padded_string_to_integer(
        tgt_tokens, max_seq_length + 1, dictionary_target)

    dataset = TransformerDataset(list(zip(src_padded, tgt_padded)),
                                 dictionary_source)

    batch = dataset[1]

    assert len(batch) == 2
    assert type(batch[0]) == torch.Tensor
    assert type(batch[1]) == torch.Tensor
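# TransformerDataset.__getitem__ is not shown in these snippets. Given the
# assertion above, it plausibly converts one (src, tgt) pair of padded integer
# lists into a pair of LongTensors; a minimal sketch of that contract (the
# body below is an assumption):
def transformer_dataset_getitem_sketch(data, idx):
    src, tgt = data[idx]
    return torch.LongTensor(src), torch.LongTensor(tgt)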
def test_input_output_dims_transformer():
    test_1_args = Namespace(
        num_layers_per_stack=2,
        dim_model=512,
        dim_ffn=2048,
        num_heads=8,
        max_sequence_length=20,
        dropout=0.1,
    )

    # mock dataset
    src_tokens = [["the", "cow", "jumped", "over", "the", "moon"],
                  ["the", "british", "are", "coming"]]
    tgt_tokens = [["la", "vache", "a", "sauté", "sur", "la", "lune"],
                  ["les", "britanniques", "arrivent"]]
    batch_size = len(src_tokens)
    dictionary_source = NLPVocabulary.build_vocabulary(src_tokens)
    dictionary_target = NLPVocabulary.build_vocabulary(tgt_tokens)
    max_seq_length = 20
    src_padded = TransformerDataset.padded_string_to_integer(
        src_tokens, max_seq_length, dictionary_source)
    tgt_padded = TransformerDataset.padded_string_to_integer(
        tgt_tokens, max_seq_length + 1, dictionary_target)
    data = TransformerBatch(torch.LongTensor(src_padded),
                            torch.LongTensor(tgt_padded))

    model = transformer.Transformer(len(dictionary_source),
                                    len(dictionary_target),
                                    test_1_args.num_layers_per_stack,
                                    test_1_args.dim_model, test_1_args.dim_ffn,
                                    test_1_args.num_heads,
                                    test_1_args.max_sequence_length,
                                    test_1_args.dropout)
    # push through model
    y_hat = model(data)

    # assert all dimensions are correct
    assert y_hat.size() == torch.Size(
        [batch_size, max_seq_length,
         len(dictionary_target)])
    @classmethod
    def get_target_context_data(cls, train_text: List,
                                dictionary: NLPVocabulary, context_size: int,
                                train: bool) -> List:
        """
        Class method that takes a list of tokenized text and converts it into
        sub-sampled (input, context) pairs. Note that sub-sampling only happens
        on the training dataset (see Mikolov et al. for details).

        Args:
            train_text (list): list of tokenized data used to derive (input, context) pairs.
            dictionary (NLPVocabulary): a dictionary built from the training data to map tokens <-> idxs.
            context_size (int): the window around each input word used to derive context pairings.
            train (bool): a "train" flag indicating that the training set should be sub-sampled.
        Returns:
            list of (input_idx, context_idx) pairs to be used for the negative-sampling loss.
        """
        train_data = []
        word_probas = dictionary.get_word_discard_probas()
        for tokens in train_text:
            tokens = [dictionary.lookup_token(x) for x in tokens]
            train_data.extend(
                cls.get_skipgram_context(tokens, context_size, word_probas,
                                         train))
        return train_data
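    # cls.get_skipgram_context is not shown in these snippets. A minimal
    # sketch of the standard skip-gram pairing it plausibly performs (the
    # body below is an assumption; the sub-sampling discard probabilities
    # follow Mikolov et al.):
    @staticmethod
    def get_skipgram_context_sketch(tokens, context_size, word_probas, train):
        import random
        pairs = []
        for i, center in enumerate(tokens):
            # during training, randomly discard frequent words (sub-sampling)
            if train and random.random() < word_probas[center]:
                continue
            lo = max(0, i - context_size)
            hi = min(len(tokens), i + context_size + 1)
            for j in range(lo, hi):
                if j != i:
                    pairs.append((center, tokens[j]))
        return pairs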
    @classmethod
    def prune_vocab(cls, vocab: NLPVocabulary,
                    prob_thresh: float) -> NLPVocabulary:
        """
        Reduce the dictionary of a corpus to a more manageable size.

        Args:
            vocab (NLPVocabulary): The original dictionary.
            prob_thresh (float): threshold of word frequency over which to keep tokens.
        Returns:
            Pruned dictionary.
        """
        word_probas = vocab.get_word_frequencies()
        # special tokens have 0 word_counts and fall below the threshold;
        # build_vocabulary re-creates them when the dictionary is re-built.
        # prob_thresh is a hard-coded hyper-parameter at the call sites.
        keep_words = word_probas > prob_thresh
        idx_to_token = vocab.idx_to_token
        keep_tokens = []
        for idx, keep in enumerate(keep_words):
            if keep:
                keep_tokens.append(idx_to_token[idx])
        # re-build the dictionary
        vocab = NLPVocabulary.build_vocabulary([keep_tokens])
        return vocab
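    # Usage sketch, mirroring the call in the IMDB get_training_data snippet
    # above (1.e-6 is the frequency threshold below which tokens are dropped):
    #
    #     vocab = NLPVocabulary.build_vocabulary(train_text)
    #     vocab_small = cls.prune_vocab(vocab, 1.e-6)
    #     assert len(vocab_small) <= len(vocab)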
    @classmethod
    def get_training_data(
            cls, block_size: int) -> Tuple[AbstractNLPDataset, NLPVocabulary]:
        """
        Returns the dataset along with the vocabulary object.

        Args:
            block_size (int): The size of the context window.
        Returns:
            Tuple of the dataset and dictionary.
        """
        # download the Hugging Face wikitext-2-raw-v1 language modeling dataset
        train_dataset = load_dataset("wikitext", 'wikitext-2-raw-v1')['train']
        # flatten the pyarrow chunks into one string
        train_dataset = [" ".join([str(x) for x in train_dataset._data[0]])]
        train_dataset = tokenize_corpus_basic(train_dataset, False)
        # hack: consider grabbing only the first ~300k tokens here, since the
        # full corpus is > 1MM words
        # build vocabulary
        vocab = NLPVocabulary.build_vocabulary([train_dataset[0]])
        train_dataset = torch.LongTensor(
            [vocab.token_to_idx[x] for x in train_dataset[0]])
        # we pass the dataset, vocab... Dataset will do the rest
        return cls(train_dataset, vocab, block_size), vocab
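        # The dataset's __getitem__ is not shown in these snippets. For a
        # block_size context window over one long token tensor, the standard
        # language-modeling slicing would be (an assumption, not the
        # project's actual code):
        #
        #     def __getitem__(self, idx):
        #         chunk = self.data[idx:idx + self.block_size + 1]
        #         # input is the block; target is the block shifted left by one
        #         return chunk[:-1], chunk[1:]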
    @classmethod
    def get_training_data(
            cls, *args: Any) -> Tuple[AbstractNLPDataset, NLPVocabulary]:
        """
        Class method to generate the training dataset (derived from the Hugging Face "ag_news" dataset).
        This method grabs the raw text, tokenizes and cleans up the data, generates a dictionary,
        and generates sub-sampled (input, context) pairs for training.

        Returns:
            (NLPDataset,NLPVocabulary) tuple to be used downstream in training.
        """

        context_size, thresh = args
        # Using the AG News data via Hugging Face
        train_text = load_dataset("ag_news")['train']['text']
        train_text = tokenize_corpus_basic(train_text)
        dictionary = NLPVocabulary.build_vocabulary(train_text)
        # for sub-sampling
        dictionary.set_proba_thresh(thresh)
        train_data = cls.get_target_context_data(train_text,
                                                 dictionary,
                                                 context_size,
                                                 train=True)
        return cls(train_data), dictionary
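        # Usage sketch (the owning class is not shown; SkipGramDataset is a
        # hypothetical name). The two positional args unpacked above are the
        # skip-gram window size and the sub-sampling threshold:
        #
        #     train_data, dictionary = SkipGramDataset.get_training_data(5, 1.e-3)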