def test_preprocessing() -> None:
    """Test that preprocessing tokenises every utterance as expected."""
    df = read_test_data()
    processed = data.preprocess_data(df)
    assert list(processed.iloc[0]['Tokens']) == test_tokens[0]
    assert list(processed.iloc[1]['Tokens']) == test_tokens[1]
    assert list(processed.iloc[2]['Tokens']) == test_tokens[2]
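# A minimal sketch of the contract test_preprocessing relies on: preprocessing
# must add a 'Tokens' column holding one token list per row. The 'Utterance'
# source column and the plain whitespace tokeniser below are assumptions for
# illustration, not necessarily what data.preprocess_data actually does.
import pandas as pd

def _preprocess_data_sketch(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # Assumed tokenisation: lowercase, then split on whitespace.
    out['Tokens'] = out['Utterance'].str.lower().str.split()
    return out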
def test_word_types() -> None:
    """Test that every token in the test data appears among the word types."""
    df = read_test_data()
    df = preprocess_data(df)
    words = flatten2list(list(df['Tokens']))
    word_types = data.get_word_types(words)
    for tokens in test_tokens:
        assert all(token in word_types for token in tokens)
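# Sketches of the two helpers exercised above, assuming the contracts the test
# implies: flatten2list removes one level of nesting, and get_word_types
# collects the distinct tokens. The real implementations may differ.
from typing import Iterable, List, Set, TypeVar

T = TypeVar('T')

def _flatten2list_sketch(nested: Iterable[Iterable[T]]) -> List[T]:
    # One level of flattening: a list of token lists becomes a flat token list.
    return [item for sublist in nested for item in sublist]

def _get_word_types_sketch(words: Iterable[str]) -> Set[str]:
    # Word types are the unique tokens observed in the corpus.
    return set(words)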
def test_index_uniqueness() -> None:
    """Test that every token has a unique index."""
    df = read_test_data()
    df = preprocess_data(df)
    words = flatten2list(list(df['Tokens']))
    word_types = data.get_word_types(words)
    word2idx, idx2word = data.build_indexes(word_types, PAD_TOKEN, UNK_TOKEN)
    for sentence in test_tokens:
        indexes = [word2idx[token] for token in set(sentence)]
        assert len(indexes) == len(set(indexes))
def __init__(self,
             data: DataFrameOrFilePath,
             vocab: Optional[Vocabulary] = None,
             mode: str = 'sentiment'):
    """Initialise Dataset with data, vocab and mode."""
    if isinstance(data, (Path, str)):
        data = pd.read_csv(data)
    self.data = preprocess_data(data)
    if vocab is None:
        vocab = Vocabulary.build_vocab(self.data)
    self.vocab = vocab
    self.mode = mode
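# Hypothetical usage of the initialiser above; the column names in the inline
# DataFrame are assumptions about the expected MELD-style schema, and
# 'dev.csv' is a placeholder path. Passing a str or Path instead of a
# DataFrame exercises the pd.read_csv branch.
#
#   df = pd.DataFrame({'Utterance': ['Hi there!'],
#                      'Sentiment': ['neutral'],
#                      'Dialogue_ID': [0], 'Utterance_ID': [0]})
#   train_set = MeldLinearTextDataset(df)              # vocab built here
#   dev_set = MeldLinearTextDataset('dev.csv',
#                                   vocab=train_set.vocab,
#                                   mode='emotion')    # reuse train vocab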
def test_linear_dataset_sentiment() -> None:
    """Test indexing into the dataset in sentiment mode."""
    df = read_test_data()
    df = preprocess_data(df)
    dataset = MeldLinearTextDataset(df, mode='sentiment')
    assert dataset[0].dialogue_id == 0
    assert dataset[0].utterance_id == 0
    assert dataset[0].label.equal(torch.tensor(2))
    assert len(dataset[0].tokens) == len(test_tokens[0])
    assert dataset[1].dialogue_id == 0
    assert dataset[1].utterance_id == 1
    assert dataset[1].label.equal(torch.tensor(2))
    assert len(dataset[1].tokens) == len(test_tokens[1])
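# The assertions above imply each dataset item exposes dialogue_id,
# utterance_id, a tensor label and a token sequence. A dataclass like the one
# below would satisfy that interface; the actual item type in the codebase is
# not shown here and may differ.
from dataclasses import dataclass
from typing import List
import torch

@dataclass
class _DatasetItemSketch:
    dialogue_id: int
    utterance_id: int
    label: torch.Tensor
    tokens: List[int]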
def test_build_indexes() -> None:
    """Test that every token has an index and the two-way mapping is correct."""
    df = read_test_data()
    df = preprocess_data(df)
    words = flatten2list(list(df['Tokens']))
    word_types = data.get_word_types(words)
    word2idx, idx2word = data.build_indexes(word_types, PAD_TOKEN, UNK_TOKEN)
    for sentence in test_tokens:
        for token in sentence:
            assert token in word2idx
            index = word2idx[token]
            assert idx2word[index] == token
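# A minimal sketch of build_indexes under the contract the tests above check:
# a bijection between word types and indexes, with the padding and unknown
# tokens included. Reserving indexes 0 and 1 for PAD and UNK is an assumption;
# the real implementation may order entries differently.
from typing import Dict, Iterable, Tuple

def _build_indexes_sketch(word_types: Iterable[str],
                          pad_token: str,
                          unk_token: str) -> Tuple[Dict[str, int], Dict[int, str]]:
    word2idx: Dict[str, int] = {pad_token: 0, unk_token: 1}
    for word in word_types:
        if word not in word2idx:
            word2idx[word] = len(word2idx)
    # Invert the mapping; uniqueness of indexes makes this a true bijection.
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word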