def test_contextual_train(monkeypatch: Any) -> None:
    """Run the full training loop end-to-end for a contextual GloVe model."""
    set_seed(1000)
    # Silence wandb so the test does not try to log to a remote service.
    monkeypatch.setattr(wandb, 'log', _noop)

    frame = read_test_data()
    ds_train = mctd.MeldContextualTextDataset(frame, mode='emotion')
    # Dev set reuses the training vocabulary so indices agree across splits.
    ds_dev = mctd.MeldContextualTextDataset(
        frame, mode='emotion', vocab=ds_train.vocab)

    loader_train = mctd.meld_contextual_text_daloader(
        dataset=ds_train,
        batch_size=tm.batch_size,
    )
    loader_dev = mctd.meld_contextual_text_daloader(
        dataset=ds_dev,
        batch_size=tm.batch_size,
    )

    model = glove_contextual_simple(
        glove_path=StringIO(tm.glove_str),  # synthetic in-memory embeddings
        glove_dim=tm.glove_dim,
        num_classes=tm.num_classes,
        vocab=ds_train.vocab,
    )
    train(model=model, trainloader=loader_train, devloader=loader_dev)
def test_train_cuda(monkeypatch: Any) -> None:
    """Run the training loop for the Simple model on GPU 0 (requires CUDA)."""
    set_seed(1000)
    # Silence wandb so the test does not try to log to a remote service.
    monkeypatch.setattr(wandb, 'log', _noop)

    frame = read_test_data()
    ds_train = mltd.MeldLinearTextDataset(frame, mode='emotion')
    # Dev set reuses the training vocabulary so indices agree across splits.
    ds_dev = mltd.MeldLinearTextDataset(
        frame, mode='emotion', vocab=ds_train.vocab)

    loader_train = mltd.meld_linear_text_daloader(
        dataset=ds_train,
        batch_size=tm.batch_size,
    )
    loader_dev = mltd.meld_linear_text_daloader(
        dataset=ds_dev,
        batch_size=tm.batch_size,
    )

    model = glove_simple(
        glove_path=StringIO(tm.glove_str),  # synthetic in-memory embeddings
        glove_dim=tm.glove_dim,
        num_classes=tm.num_classes,
        vocab=ds_train.vocab,
    )
    # gpu=0 pins training to the first CUDA device.
    train(model=model, trainloader=loader_train, devloader=loader_dev, gpu=0)
def test_train_weights(monkeypatch: Any) -> None:
    """Run the training loop with per-class weights for the loss function."""
    set_seed(1000)
    # One weight per emotion class (7 classes).
    class_weights = torch.tensor([4.0, 15.0, 15.0, 3.0, 1.0, 6.0, 3.0])
    # Silence wandb so the test does not try to log to a remote service.
    monkeypatch.setattr(wandb, 'log', _noop)

    frame = read_test_data()
    ds_train = mltd.MeldLinearTextDataset(frame, mode='emotion')
    # Dev set reuses the training vocabulary so indices agree across splits.
    ds_dev = mltd.MeldLinearTextDataset(
        frame, mode='emotion', vocab=ds_train.vocab)

    loader_train = mltd.meld_linear_text_daloader(
        dataset=ds_train,
        batch_size=tm.batch_size,
    )
    loader_dev = mltd.meld_linear_text_daloader(
        dataset=ds_dev,
        batch_size=tm.batch_size,
    )

    model = glove_simple(
        glove_path=StringIO(tm.glove_str),  # synthetic in-memory embeddings
        glove_dim=tm.glove_dim,
        num_classes=tm.num_classes,
        vocab=ds_train.vocab,
    )
    train(model=model, trainloader=loader_train, devloader=loader_dev,
          weights=class_weights)
def test_vocabularies_are_equal() -> None:
    """Check that the contextual and linear datasets build same-sized vocabularies.

    NOTE(review): only the vocabulary *sizes* are compared here, not the
    actual word-to-index mappings — confirm whether size equality is the
    intended guarantee.
    """
    frame = read_test_data()
    contextual = MeldContextualTextDataset(frame, mode='sentiment')
    linear = MeldLinearTextDataset(frame, mode='sentiment')
    assert contextual.vocab_size() == linear.vocab_size()
def test_preprocessing() -> None:
    """Check that preprocessing produces the expected tokens for each utterance."""
    processed = data.preprocess_data(read_test_data())
    # Compare the first three rows against the known reference tokenizations.
    for row, expected in enumerate(test_tokens[:3]):
        assert list(processed.iloc[row]['Tokens']) == expected
def test_vocabulary_unknown_index() -> None:
    """Test that an out-of-range index maps back to the unknown token."""
    # NOTE(review): other tests use mode='emotion' (singular) — confirm
    # whether 'emotions' is intentional or silently falls back to a default.
    dataset = MeldContextualTextDataset(read_test_data(), mode='emotions')
    vocabulary = dataset.vocab
    # 100000 is far beyond any index a small test vocabulary could contain.
    assert vocabulary.index2word(100000) == vocabulary.unk_token
def test_bc_lstm() -> None:
    """Check output shapes of the bcLSTM model on synthetic data."""
    frame = read_test_data()
    dataset = MeldContextualTextDataset(frame, mode='emotion')
    n_batch = 2       # dialogues per batch
    n_utterances = 2  # padded utterances per dialogue in the test data

    loader = meld_contextual_text_daloader(
        dataset=dataset,
        batch_size=n_batch,
    )
    model = glove_bc_lstm(
        glove_path=StringIO(glove_str),  # synthetic in-memory embeddings
        glove_dim=glove_dim,
        num_classes=num_classes,
        vocab=dataset.vocab,
        filters=[3, 5],
        out_channels=3,
    )

    for batch in loader:
        predictions, loss = model(batch.tokens, batch.masks, batch.labels)
        assert predictions.shape == (n_batch, n_utterances, num_classes)
        # The loss must be a scalar tensor.
        assert loss.shape == ()
def test_vocabulary_known_index() -> None:
    """Test that a known (in-vocabulary) index maps to a real word."""
    # NOTE(review): other tests use mode='emotion' (singular) — confirm
    # whether 'emotions' is intentional or silently falls back to a default.
    dataset = MeldContextualTextDataset(read_test_data(), mode='emotions')
    vocabulary = dataset.vocab
    # Index 2 is the first word after PAD/UNK for any non-empty vocabulary.
    assert vocabulary.index2word(2) != vocabulary.unk_token
def test_vocabulary_unknown_word() -> None:
    """Test that an out-of-vocabulary word maps to the unknown index."""
    # NOTE(review): other tests use mode='emotion' (singular) — confirm
    # whether 'emotions' is intentional or silently falls back to a default.
    dataset = MeldContextualTextDataset(read_test_data(), mode='emotions')
    vocabulary = dataset.vocab
    unknown_index = vocabulary.word2index(vocabulary.unk_token)
    assert vocabulary.word2index('COMPLETELYIMPOSSIBLETOKNOW') == unknown_index
def test_word_types() -> None:
    """Check that every reference token appears among the extracted word types."""
    frame = preprocess_data(read_test_data())
    all_tokens = flatten2list(list(frame['Tokens']))
    word_types = data.get_word_types(all_tokens)
    for sentence in test_tokens:
        for token in sentence:
            assert token in word_types
def test_vocabulary_token_id_map() -> None:
    """Test that words -> ids -> words round-trips through the vocabulary."""
    # NOTE(review): other tests use mode='emotion' (singular) — confirm
    # whether 'emotions' is intentional or silently falls back to a default.
    dataset = MeldContextualTextDataset(read_test_data(), mode='emotions')
    vocabulary = dataset.vocab
    original = ['oh', 'my', 'god']
    round_tripped = vocabulary.map_ids_to_tokens(
        vocabulary.map_tokens_to_ids(original))
    assert original == round_tripped
def test_index_uniqueness() -> None:
    """Test that every token gets a unique index."""
    frame = preprocess_data(read_test_data())
    all_tokens = flatten2list(list(frame['Tokens']))
    word_types = data.get_word_types(all_tokens)
    word2idx, _ = data.build_indexes(word_types, PAD_TOKEN, UNK_TOKEN)
    for sentence in test_tokens:
        # Distinct tokens must map to distinct indexes.
        unique_tokens = set(sentence)
        indexes = {word2idx[token] for token in unique_tokens}
        assert len(indexes) == len(unique_tokens)
def test_linear_dataset_sentiment() -> None:
    """Check the fields of the first two items of the linear sentiment dataset."""
    frame = preprocess_data(read_test_data())
    dataset = MeldLinearTextDataset(frame, mode='sentiment')
    # Both items belong to dialogue 0 and carry sentiment label 2.
    for idx in (0, 1):
        item = dataset[idx]
        assert item.dialogue_id == 0
        assert item.utterance_id == idx
        assert item.label.equal(torch.tensor(2))
        assert len(item.tokens) == len(test_tokens[idx])
def test_build_indexes() -> None:
    """Test that every token has an index and the two-way mapping is consistent."""
    frame = preprocess_data(read_test_data())
    all_tokens = flatten2list(list(frame['Tokens']))
    word_types = data.get_word_types(all_tokens)
    word2idx, idx2word = data.build_indexes(word_types, PAD_TOKEN, UNK_TOKEN)
    for sentence in test_tokens:
        for token in sentence:
            assert token in word2idx
            # Mapping a token to its index and back must be the identity.
            assert idx2word[word2idx[token]] == token
def test_contextual_dataset_sentiment() -> None:
    """Check labels and padded token shapes of the contextual sentiment dataset."""
    dataset = MeldContextualTextDataset(read_test_data(), mode='sentiment')

    # Dialogue 0 has two utterances, padded to the longer of the two.
    pad_len = max(len(test_tokens[0]), len(test_tokens[1]))
    first = dataset[0]
    assert first.dialogue_id == 0
    assert first.labels.equal(torch.tensor([2, 2]))
    assert len(first.tokens) == 2
    assert len(first.tokens[0]) == pad_len
    assert len(first.tokens[1]) == pad_len

    # Dialogue 1 has a single utterance, so no padding is needed.
    second = dataset[1]
    assert second.dialogue_id == 1
    assert second.labels.equal(torch.tensor([1]))
    assert len(second.tokens) == 1
    assert len(second.tokens[0]) == len(test_tokens[2])
def test_contextual_dataloader() -> None:
    """Check batching, padding, and ordering of the contextual dataloader."""
    # NOTE(review): other tests use mode='emotion' (singular) — confirm
    # whether 'emotions' is intentional or silently falls back to a default.
    dataset = MeldContextualTextDataset(read_test_data(), mode='emotions')
    loader = meld_contextual_text_daloader(
        dataset=dataset,
        batch_size=2,
    )

    lengths = [len(tokens) for tokens in test_tokens]
    longest = max(lengths)

    for batch in loader:
        # Dialogues are ordered by descending length: dialogue 1 (one long
        # utterance) comes before dialogue 0 (two shorter utterances).
        assert batch.dialogue_ids.equal(torch.tensor([1, 0]))
        assert batch.labels.equal(torch.tensor([[4, 0], [6, 5]]))
        # Zero length marks a padding utterance in the shorter dialogue.
        assert batch.lengths.equal(
            torch.tensor([[lengths[2], 0], [lengths[0], lengths[1]]]))
        assert batch.tokens.shape == (2, 2, longest)
def test_linear_dataloader() -> None:
    """Check batching, padding, and ordering of the linear dataloader."""
    dataset = MeldLinearTextDataset(read_test_data())
    loader = meld_linear_text_daloader(
        dataset=dataset,
        batch_size=3,
    )

    lengths = [len(tokens) for tokens in test_tokens]
    longest = max(lengths)

    for batch in loader:
        # Utterances are ordered by descending length within the batch.
        assert batch.dialogue_ids.equal(torch.tensor([1, 0, 0]))
        assert batch.utterance_ids.equal(torch.tensor([0, 0, 1]))
        assert batch.labels.equal(torch.tensor([1, 2, 2]))
        assert batch.lengths.equal(
            torch.tensor([lengths[2], lengths[0], lengths[1]]))
        # Every sequence is padded to the longest utterance in the batch.
        assert all(len(sequence) == longest for sequence in batch.tokens)
def test_random_simple() -> None:
    """Check output shapes of the randomly-initialized Simple model."""
    dataset = MeldLinearTextDataset(read_test_data(), mode='emotion')
    loader = meld_linear_text_daloader(
        dataset=dataset,
        batch_size=batch_size,
    )
    model = random_emb_simple(
        vocab_size=dataset.vocab_size(),
        embedding_dim=embedding_dim,
        num_classes=num_classes,
    )
    for batch in loader:
        predictions, _ = model(batch.tokens, batch.labels)
        assert predictions.shape == (batch_size, num_classes)
def test_linear_rnn() -> None:
    """Check that the linear GloVe LSTM model runs on synthetic data."""
    dataset = MeldLinearTextDataset(read_test_data(), mode='emotion')
    loader = meld_linear_text_daloader(
        dataset=dataset,
        batch_size=batch_size,
    )
    model = glove_linear_lstm(
        glove_path=StringIO(glove_str),  # synthetic in-memory embeddings
        glove_dim=glove_dim,
        num_classes=num_classes,
        vocab=dataset.vocab,
    )
    for batch in loader:
        predictions, _ = model(batch.tokens, batch.labels)
        assert predictions.shape == (batch_size, num_classes)
def test_linear_cnn_rnn() -> None:
    """Test if Linear Cnn+Rnn model works with synthetic data.

    Builds the loader with the shared module-level ``batch_size`` (the
    original hard-coded ``3`` here while the shape assertion below used
    ``batch_size`` — the two could silently drift apart).
    """
    df = read_test_data()
    dataset = MeldLinearTextDataset(df, mode='emotion')
    glove_file = StringIO(glove_str)  # synthetic in-memory embeddings
    loader = meld_linear_text_daloader(
        dataset=dataset,
        # Use the same constant the assertion references so loader and
        # expectation always agree.
        batch_size=batch_size,
    )
    classifier = glove_linear_cnn_lstm(
        glove_path=glove_file,
        glove_dim=glove_dim,
        num_classes=num_classes,
        vocab=dataset.vocab,
        filters=[3, 5],
        out_channels=3,
    )
    for batch in loader:
        predictions, _ = classifier(batch.tokens, batch.labels)
        assert predictions.shape == (batch_size, num_classes)