Example #1
def test_contextual_train(monkeypatch: Any) -> None:
    """Test training loop for a contextual module."""
    set_seed(1000)

    monkeypatch.setattr(wandb, 'log', _noop)

    df = read_test_data()
    train_dataset = mctd.MeldContextualTextDataset(df, mode='emotion')
    dev_dataset = mctd.MeldContextualTextDataset(df,
                                                 mode='emotion',
                                                 vocab=train_dataset.vocab)
    glove_file = StringIO(tm.glove_str)

    train_loader = mctd.meld_contextual_text_daloader(
        dataset=train_dataset,
        batch_size=tm.batch_size,
    )
    dev_loader = mctd.meld_contextual_text_daloader(
        dataset=dev_dataset,
        batch_size=tm.batch_size,
    )

    classifier = glove_contextual_simple(
        glove_path=glove_file,
        glove_dim=tm.glove_dim,
        num_classes=tm.num_classes,
        vocab=train_dataset.vocab,
    )

    train(model=classifier, trainloader=train_loader, devloader=dev_loader)
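The training tests above lean on a few shared helpers that are not shown (`_noop`, `set_seed`, `read_test_data`, and what appears to be a shared test module `tm`). A minimal sketch of what the first two might look like, assuming standard Python/NumPy/PyTorch seeding; the project's actual helpers may differ:

import random
from typing import Any

import numpy as np
import torch


def _noop(*args: Any, **kwargs: Any) -> None:
    """Stand-in for wandb.log so the tests never hit the network."""


def set_seed(seed: int) -> None:
    """Seed the common RNGs so the training tests are reproducible."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)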
Example #2
def test_train_cuda(monkeypatch: Any) -> None:
    """Test if Simple trains on CUDA."""
    set_seed(1000)

    monkeypatch.setattr(wandb, 'log', _noop)

    df = read_test_data()
    train_dataset = mltd.MeldLinearTextDataset(df, mode='emotion')
    dev_dataset = mltd.MeldLinearTextDataset(df,
                                             mode='emotion',
                                             vocab=train_dataset.vocab)
    glove_file = StringIO(tm.glove_str)

    train_loader = mltd.meld_linear_text_daloader(
        dataset=train_dataset,
        batch_size=tm.batch_size,
    )
    dev_loader = mltd.meld_linear_text_daloader(
        dataset=dev_dataset,
        batch_size=tm.batch_size,
    )

    classifier = glove_simple(
        glove_path=glove_file,
        glove_dim=tm.glove_dim,
        num_classes=tm.num_classes,
        vocab=train_dataset.vocab,
    )

    train(model=classifier,
          trainloader=train_loader,
          devloader=dev_loader,
          gpu=0)
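Because `gpu=0` is passed to `train`, this test needs a CUDA device. The listing does not show how that is handled; one common pattern (an addition here, not taken from the original suite) is to skip the test when CUDA is unavailable:

import pytest
import torch

# Hypothetical guard for the CUDA test above; apply as @requires_cuda.
requires_cuda = pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason='CUDA device required',
)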
Example #3
def test_train_weights(monkeypatch: Any) -> None:
    """Test training loop with weights for loss function."""
    set_seed(1000)
    weights = torch.tensor([4.0, 15.0, 15.0, 3.0, 1.0, 6.0, 3.0])

    monkeypatch.setattr(wandb, 'log', _noop)

    df = read_test_data()
    train_dataset = mltd.MeldLinearTextDataset(df, mode='emotion')
    dev_dataset = mltd.MeldLinearTextDataset(df,
                                             mode='emotion',
                                             vocab=train_dataset.vocab)
    glove_file = StringIO(tm.glove_str)

    train_loader = mltd.meld_linear_text_daloader(
        dataset=train_dataset,
        batch_size=tm.batch_size,
    )
    dev_loader = mltd.meld_linear_text_daloader(
        dataset=dev_dataset,
        batch_size=tm.batch_size,
    )

    classifier = glove_simple(
        glove_path=glove_file,
        glove_dim=tm.glove_dim,
        num_classes=tm.num_classes,
        vocab=train_dataset.vocab,
    )

    train(model=classifier,
          trainloader=train_loader,
          devloader=dev_loader,
          weights=weights)
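The `weights` tensor holds one value per emotion class (seven here). Assuming `train` forwards it to a weighted cross-entropy loss, which is the usual role of such a tensor, the effect can be illustrated in isolation:

import torch
from torch import nn

# Illustration only: per-class weights entering a weighted cross-entropy.
weights = torch.tensor([4.0, 15.0, 15.0, 3.0, 1.0, 6.0, 3.0])
criterion = nn.CrossEntropyLoss(weight=weights)

logits = torch.randn(4, 7)           # batch of 4 utterances, 7 classes
targets = torch.randint(0, 7, (4,))  # gold emotion labels
loss = criterion(logits, targets)    # rare classes contribute more to the loss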
Example #4
def test_vocabularies_are_equal() -> None:
    """Test if vocabularies by all datasets are equal to each other."""
    df = read_test_data()
    context_dataset = MeldContextualTextDataset(df, mode='sentiment')
    linear_dataset = MeldLinearTextDataset(df, mode='sentiment')

    assert context_dataset.vocab_size() == linear_dataset.vocab_size()
Example #5
def test_preprocessing() -> None:
    """Test that preprocess_data produces the expected tokens."""
    df = read_test_data()
    processed = data.preprocess_data(df)

    assert list(processed.iloc[0]['Tokens']) == test_tokens[0]
    assert list(processed.iloc[1]['Tokens']) == test_tokens[1]
    assert list(processed.iloc[2]['Tokens']) == test_tokens[2]
Example #6
def test_vocabulary_unknown_index() -> None:
    """Test if vocabulary returns unkown word for unknown index."""
    df = read_test_data()
    dataset = MeldContextualTextDataset(df, mode='emotion')
    vocab = dataset.vocab

    assert vocab.index2word(100000) == vocab.unk_token
Example #7
def test_bc_lstm() -> None:
    """Test if bcLSTM works with synthetic data."""
    df = read_test_data()
    dataset = MeldContextualTextDataset(df, mode='emotion')
    glove_file = StringIO(glove_str)

    batch_size = 2
    nutterances = 2

    loader = meld_contextual_text_daloader(
        dataset=dataset,
        batch_size=batch_size,
    )

    classifier = glove_bc_lstm(
        glove_path=glove_file,
        glove_dim=glove_dim,
        num_classes=num_classes,
        vocab=dataset.vocab,
        filters=[3, 5],
        out_channels=3,
    )

    for batch in loader:
        predictions, loss = classifier(batch.tokens, batch.masks, batch.labels)
        assert predictions.shape == (batch_size, nutterances, num_classes)
        assert loss.shape == ()  # scalar
Example #8
def test_vocabulary_known_index() -> None:
    """Test if vocabulary returns correct word for known index."""
    df = read_test_data()
    dataset = MeldContextualTextDataset(df, mode='emotion')
    vocab = dataset.vocab

    word = vocab.index2word(2)  # index 2 is the first non-PAD/UNK word in a non-empty vocab
    assert word != vocab.unk_token
Example #9
def test_vocabulary_unknown_word() -> None:
    """Test if vocabulary returns unkown index for unknown word."""
    df = read_test_data()
    dataset = MeldContextualTextDataset(df, mode='emotion')
    vocab = dataset.vocab

    unknown_index = vocab.word2index(vocab.unk_token)
    assert vocab.word2index('COMPLETELYIMPOSSIBLETOKNOW') == unknown_index
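Examples #6, #8 and #9 all rely on the vocabulary falling back to the UNK entry for out-of-range indices and unseen words. A hypothetical vocabulary with exactly that behaviour (a sketch consistent with the assertions, not the project's actual class):

from typing import Dict, List


class TinyVocab:
    """Hypothetical vocabulary matching the fallback behaviour tested above."""

    def __init__(self, words: List[str],
                 pad_token: str = '<PAD>', unk_token: str = '<UNK>') -> None:
        self.pad_token = pad_token
        self.unk_token = unk_token
        self._idx2word = [pad_token, unk_token] + sorted(set(words))
        self._word2idx: Dict[str, int] = {
            word: index for index, word in enumerate(self._idx2word)
        }

    def index2word(self, index: int) -> str:
        # Out-of-range indices map to the unknown token.
        if 0 <= index < len(self._idx2word):
            return self._idx2word[index]
        return self.unk_token

    def word2index(self, word: str) -> int:
        # Unseen words map to the unknown token's index.
        return self._word2idx.get(word, self._word2idx[self.unk_token])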
Example #10
def test_word_types() -> None:
    """Test that get_word_types covers every token in the test data."""
    df = read_test_data()
    df = preprocess_data(df)
    words = flatten2list(list(df['Tokens']))

    word_types = data.get_word_types(words)

    for tokens in test_tokens:
        assert all(token in word_types for token in tokens)
Example #11
def test_vocabulary_token_id_map() -> None:
    """Test if vocab can correctly map list of words to integers and back."""
    df = read_test_data()
    dataset = MeldContextualTextDataset(df, mode='emotion')
    vocab = dataset.vocab

    words = ['oh', 'my', 'god']
    indexes = vocab.map_tokens_to_ids(words)
    new_words = vocab.map_ids_to_tokens(indexes)

    assert words == new_words
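The round trip only holds for words already in the vocabulary; anything unseen would come back as the UNK token. Built on the per-word lookups sketched earlier, `map_tokens_to_ids` / `map_ids_to_tokens` could be as simple as the following (hypothetical, assuming those lookup methods):

from typing import List


def map_tokens_to_ids(vocab, tokens: List[str]) -> List[int]:
    """Map each token to its vocabulary index (UNK index if unseen)."""
    return [vocab.word2index(token) for token in tokens]


def map_ids_to_tokens(vocab, ids: List[int]) -> List[str]:
    """Map indices back to tokens (UNK token if out of range)."""
    return [vocab.index2word(index) for index in ids]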
Example #12
def test_index_uniqueness() -> None:
    "Test if every token has an unique index"
    df = read_test_data()
    df = preprocess_data(df)
    words = flatten2list(list(df['Tokens']))

    word_types = data.get_word_types(words)

    word2idx, idx2word = data.build_indexes(word_types, PAD_TOKEN, UNK_TOKEN)

    for sentence in test_tokens:
        indexes = [word2idx[token] for token in set(sentence)]
        assert len(indexes) == len(set(indexes))
Example #13
def test_linear_dataset_sentiment() -> None:
    """Test MeldLinearTextDataset items in sentiment mode."""
    df = read_test_data()
    df = preprocess_data(df)
    dataset = MeldLinearTextDataset(df, mode='sentiment')

    assert dataset[0].dialogue_id == 0
    assert dataset[0].utterance_id == 0
    assert dataset[0].label.equal(torch.tensor(2))
    assert len(dataset[0].tokens) == len(test_tokens[0])

    assert dataset[1].dialogue_id == 0
    assert dataset[1].utterance_id == 1
    assert dataset[1].label.equal(torch.tensor(2))
    assert len(dataset[1].tokens) == len(test_tokens[1])
Example #14
def test_build_indexes() -> None:
    "Test if every token has an index and the two-way mapping is right"
    df = read_test_data()
    df = preprocess_data(df)
    words = flatten2list(list(df['Tokens']))

    word_types = data.get_word_types(words)

    word2idx, idx2word = data.build_indexes(word_types, PAD_TOKEN, UNK_TOKEN)

    for sentence in test_tokens:
        for token in sentence:
            assert token in word2idx
            index = word2idx[token]
            assert idx2word[index] == token
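Examples #12 and #14 together pin down the contract of `data.build_indexes`: every word type gets exactly one index, PAD and UNK are reserved, and the two mappings invert each other. A sketch consistent with those tests (hypothetical; the project's implementation may differ):

from typing import Dict, Iterable, Tuple


def build_indexes(word_types: Iterable[str], pad_token: str,
                  unk_token: str) -> Tuple[Dict[str, int], Dict[int, str]]:
    """Build a bijective word <-> index mapping with PAD and UNK first."""
    word2idx: Dict[str, int] = {pad_token: 0, unk_token: 1}
    for word in word_types:
        if word not in word2idx:
            word2idx[word] = len(word2idx)
    idx2word = {index: word for word, index in word2idx.items()}
    return word2idx, idx2word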
Example #15
def test_contextual_dataset_sentiment() -> None:
    """Test MeldContextualTextDataset items in sentiment mode."""
    df = read_test_data()
    dataset = MeldContextualTextDataset(df, mode='sentiment')

    max_len0 = max(len(test_tokens[0]), len(test_tokens[1]))

    assert dataset[0].dialogue_id == 0
    assert dataset[0].labels.equal(torch.tensor([2, 2]))
    assert len(dataset[0].tokens) == 2
    assert len(dataset[0].tokens[0]) == max_len0
    assert len(dataset[0].tokens[1]) == max_len0

    assert dataset[1].dialogue_id == 1
    assert dataset[1].labels.equal(torch.tensor([1]))
    assert len(dataset[1].tokens) == 1
    assert len(dataset[1].tokens[0]) == len(test_tokens[2])
Example #16
def test_contextual_dataloader() -> None:
    """Test batching and padding in the contextual dataloader."""
    df = read_test_data()
    dataset = MeldContextualTextDataset(df, mode='emotion')
    loader = meld_contextual_text_daloader(
        dataset=dataset,
        batch_size=2,
    )
    length0 = len(test_tokens[0])
    length1 = len(test_tokens[1])
    length2 = len(test_tokens[2])
    max_length = max(length0, length1, length2)

    for batch in loader:
        assert batch.dialogue_ids.equal(torch.tensor([1, 0]))
        assert batch.labels.equal(torch.tensor([[4, 0], [6, 5]]))
        assert batch.lengths.equal(
            torch.tensor([[length2, 0], [length0, length1]]))
        assert batch.tokens.shape == (2, 2, max_length)
Example #17
def test_linear_dataloader() -> None:
    """Test batching and padding in the linear dataloader."""
    df = read_test_data()
    dataset = MeldLinearTextDataset(df)
    loader = meld_linear_text_daloader(
        dataset=dataset,
        batch_size=3,
    )
    length0 = len(test_tokens[0])
    length1 = len(test_tokens[1])
    length2 = len(test_tokens[2])
    max_length = max(length0, length1, length2)

    for batch in loader:
        assert batch.dialogue_ids.equal(torch.tensor([1, 0, 0]))
        assert batch.utterance_ids.equal(torch.tensor([0, 0, 1]))
        assert batch.labels.equal(torch.tensor([1, 2, 2]))
        assert batch.lengths.equal(torch.tensor([length2, length0, length1]))
        assert all(len(seq) == max_length for seq in batch.tokens)
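Both dataloader tests check that every sequence in a batch is padded to the length of the longest one, with the true lengths reported separately. The same behaviour can be reproduced in isolation with torch's pad_sequence (an illustration; the project's collate function may be implemented differently):

import torch
from torch.nn.utils.rnn import pad_sequence

# Three token-id sequences of different lengths.
seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5]), torch.tensor([6])]
lengths = torch.tensor([len(seq) for seq in seqs])

padded = pad_sequence(seqs, batch_first=True)  # zero-padded to the max length
assert padded.shape == (len(seqs), int(lengths.max()))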
Example #18
def test_random_simple() -> None:
    """Test if the Simple model with random embeddings works with synthetic data."""
    df = read_test_data()
    dataset = MeldLinearTextDataset(df, mode='emotion')

    loader = meld_linear_text_daloader(
        dataset=dataset,
        batch_size=batch_size,
    )

    classifier = random_emb_simple(
        vocab_size=dataset.vocab_size(),
        embedding_dim=embedding_dim,
        num_classes=num_classes,
    )

    for batch in loader:
        predictions, _ = classifier(batch.tokens, batch.labels)
        assert predictions.shape == (batch_size, num_classes)
Example #19
def test_linear_rnn() -> None:
    "Test if Linear Rnn GloVe loader works with synthetic data"
    df = read_test_data()
    dataset = MeldLinearTextDataset(df, mode='emotion')
    glove_file = StringIO(glove_str)

    loader = meld_linear_text_daloader(
        dataset=dataset,
        batch_size=batch_size,
    )

    classifier = glove_linear_lstm(
        glove_path=glove_file,
        glove_dim=glove_dim,
        num_classes=num_classes,
        vocab=dataset.vocab,
    )

    for batch in loader:
        predictions, _ = classifier(batch.tokens, batch.labels)
        assert predictions.shape == (batch_size, num_classes)
Example #20
def test_linear_cnn_rnn() -> None:
    """Test if Linear Cnn+Rnn model works with synthetic data."""
    df = read_test_data()
    dataset = MeldLinearTextDataset(df, mode='emotion')
    glove_file = StringIO(glove_str)

    loader = meld_linear_text_daloader(
        dataset=dataset,
        batch_size=3,
    )

    classifier = glove_linear_cnn_lstm(
        glove_path=glove_file,
        glove_dim=glove_dim,
        num_classes=num_classes,
        vocab=dataset.vocab,
        filters=[3, 5],
        out_channels=3,
    )

    for batch in loader:
        predictions, _ = classifier(batch.tokens, batch.labels)
        assert predictions.shape == (batch_size, num_classes)
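Examples #18-#20 repeat the same loader-and-forward-pass skeleton with different classifiers. One way to collapse them (a sketch reusing the names from the examples above, not the project's actual suite) is to parametrize over the model factory:

import pytest


@pytest.mark.parametrize('make_model', [
    lambda dataset: random_emb_simple(vocab_size=dataset.vocab_size(),
                                      embedding_dim=embedding_dim,
                                      num_classes=num_classes),
    lambda dataset: glove_linear_lstm(glove_path=StringIO(glove_str),
                                      glove_dim=glove_dim,
                                      num_classes=num_classes,
                                      vocab=dataset.vocab),
])
def test_linear_models(make_model) -> None:
    """Forward pass for each linear classifier on the synthetic data."""
    df = read_test_data()
    dataset = MeldLinearTextDataset(df, mode='emotion')
    loader = meld_linear_text_daloader(dataset=dataset, batch_size=batch_size)

    classifier = make_model(dataset)
    for batch in loader:
        predictions, _ = classifier(batch.tokens, batch.labels)
        assert predictions.shape == (batch_size, num_classes)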