def test_tokenize_dirty_text(self):
    """
    tokenize() must strip special characters and lowercase the words
    """
    dirty_text = 'The first% sentence><. The sec&*ond sent@ence #.'
    self.assertEqual(
        ['the', 'first', 'sentence', 'the', 'second', 'sentence'],
        tokenize(dirty_text))
def test_tokenize_several_sentences(self):
    """
    tokenize() must handle text consisting of several sentences
    """
    multi_sentence_text = 'The first sentence. The second sentence.'
    self.assertEqual(
        ['the', 'first', 'sentence', 'the', 'second', 'sentence'],
        tokenize(multi_sentence_text))
def test_tokenize_line_breaks(self):
    """
    tokenize() must treat HTML line breaks as token separators
    """
    text_with_breaks = 'The first sentence.<br /><br />The second sentence.'
    self.assertEqual(
        ['the', 'first', 'sentence', 'the', 'second', 'sentence'],
        tokenize(text_with_breaks))
def test_tokenize_bad_input(self):
    """
    tokenize() must return an empty list for any non-string argument.

    Uses subTest so that every bad input is checked even if an earlier
    one fails, and a failure message names the offending input
    (the original loop stopped at the first failure and did not say
    which input broke).
    """
    bad_inputs = [[], {}, (), None, 9, 9.34, True]
    for bad_input in bad_inputs:
        with self.subTest(bad_input=bad_input):
            self.assertEqual([], tokenize(bad_input))
def test_tokenize_ideal(self):
    """
    tokenize() happy path: plain sentence with a comma and a full stop
    """
    sample = 'The weather is sunny, the man is happy.'
    wanted = ['the', 'weather', 'is', 'sunny', 'the', 'man', 'is', 'happy']
    self.assertEqual(wanted, tokenize(sample))
def test_tokenize_big_text_length_equal(self):
    """
    Token count of a big text must equal its whitespace-split word count
    """
    source_text = read_from_file('lab_1/tokens.txt')
    self.assertEqual(len(source_text.split()), len(tokenize(source_text)))
def test_tokenize_big_text_case(self):
    """
    Tokenizing a big pre-cleaned text must match a plain whitespace split
    """
    source_text = read_from_file('lab_1/tokens.txt')
    self.assertEqual(source_text.split(), tokenize(source_text))
def test_big_text_get_adjacent_words_term(self):
    """
    get_adjacent_words() must find the neighbours of 'tex' in real text
    """
    corpus = tokenize(read_from_file('lab_1/data.txt'))
    # 4 words to the left, 31 to the right of every occurrence of 'tex'
    neighbours = get_adjacent_words(corpus, 'tex', 4, 31)
    self.assertEqual([['although', 'products']], neighbours)
def test_tokenize_punctuation_marks(self):
    """
    tokenize() must drop commas, dashes, colons and exclamation marks
    """
    punctuated = 'The, first sentence - nice. The second sentence: bad!'
    self.assertEqual(
        ['the', 'first', 'sentence', 'nice',
         'the', 'second', 'sentence', 'bad'],
        tokenize(punctuated))
def test_get_adjacent_words_several_contexts_big_text(self):
    """
    get_adjacent_words() must return one neighbour pair per occurrence
    of 'sodium' in the real text
    """
    corpus = tokenize(read_from_file('lab_1/data.txt'))
    # one word on each side of every occurrence
    neighbours = get_adjacent_words(corpus, 'sodium', 1, 1)
    self.assertEqual(
        [['epithelial', 'channels'],
         ['means', 'aluminate'],
         ['by', 'bicarbonate'],
         ['the', 'salt']],
        neighbours)
def test_get_concordance_several_contexts_big_text_right(self):
    """
    sort_concordance() with left_sort=False must order the 'sodium'
    contexts by their right neighbour
    """
    corpus = tokenize(read_from_file('lab_1/data.txt'))
    contexts = sort_concordance(corpus, 'sodium', 1, 1, False)
    self.assertEqual(
        [['means', 'sodium', 'aluminate'],
         ['by', 'sodium', 'bicarbonate'],
         ['epithelial', 'sodium', 'channels'],
         ['the', 'sodium', 'salt']],
        contexts)
def test_big_text_get_and_sort_concordance_term(self):
    """
    sort_concordance() with left sorting must return the single
    'tex' context from the real text
    """
    corpus = tokenize(read_from_file('lab_1/data.txt'))
    # 4 words of left context, 14 of right context, sorted by left side
    contexts = sort_concordance(corpus, 'tex', 4, 14, True)
    self.assertEqual(
        [['although', 'less', 'compact', 'than', 'tex', 'the', 'xml',
          'structuring', 'promises', 'to', 'make', 'it', 'widely',
          'usable', 'and', 'allows', 'for', 'instant', 'display']],
        contexts)
def test_big_text_get_concordance_term(self):
    """
    get_concordance() must return the full 'tex' context from the
    real text
    """
    corpus = tokenize(read_from_file('lab_1/data.txt'))
    # 4 words of left context, 31 of right context
    contexts = get_concordance(corpus, 'tex', 4, 31)
    self.assertEqual(
        [['although', 'less', 'compact', 'than', 'tex', 'the', 'xml',
          'structuring', 'promises', 'to', 'make', 'it', 'widely',
          'usable', 'and', 'allows', 'for', 'instant', 'display', 'in',
          'applications', 'such', 'as', 'web', 'browsers', 'and',
          'facilitates', 'an', 'interpretation', 'of', 'its', 'meaning',
          'in', 'mathematical', 'software', 'products']],
        contexts)