def test_permutation_decoder():
    """Check PermutationDecoder.decode('aba') against per-corpus accepted outputs.

    A decoder is built from the canonicalized text of each data file in turn,
    and the decoded string must be one of the candidates listed for that file.
    """
    gutenberg_text = open_data("gutenberg.txt").read()
    flatland_text = open_data("EN-text/flatland.txt").read()

    # Decoder built from the gutenberg text.
    gutenberg_candidates = ('ece', 'ete', 'tat', 'tit', 'txt')
    decoder = PermutationDecoder(canonicalize(gutenberg_text))
    assert decoder.decode('aba') in gutenberg_candidates

    # Decoder built from the flatland text.
    flatland_candidates = (
        'ded', 'did', 'ece', 'ele', 'eme', 'ere', 'eve',
        'eye', 'iti', 'mom', 'ses', 'tat', 'tit',
    )
    decoder = PermutationDecoder(canonicalize(flatland_text))
    assert decoder.decode('aba') in flatland_candidates
def test_samples():
    """Check that samples(10) from each word model yields ten space-separated words.

    The unigram model and the n=2 and n=3 n-gram models are all built from the
    words of flatland.txt concatenated with gutenberg.txt.
    """
    story = open_data("EN-text/flatland.txt").read()
    story += open_data("gutenberg.txt").read()
    wordseq = words(story)

    P1 = UnigramWordModel(wordseq)
    P2 = NgramWordModel(2, wordseq)
    P3 = NgramWordModel(3, wordseq)

    # Each model is sampled exactly once, so the n=2 model is exercised too.
    s1 = P1.samples(10)
    s2 = P2.samples(10)
    s3 = P3.samples(10)

    assert len(s1.split(' ')) == 10
    assert len(s2.split(' ')) == 10
    assert len(s3.split(' ')) == 10
def test_viterbi_segmentation():
    """Check that viterbi_segment splits an unspaced sentence into its words.

    The segmentation uses a UnigramWordModel built from the words of
    flatland.txt.
    """
    corpus = open_data("EN-text/flatland.txt").read()
    model = UnigramWordModel(words(corpus))

    expected_words = [
        'it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces'
    ]

    # viterbi_segment returns a pair; only the first element is checked here.
    segments, _ = viterbi_segment("itiseasytoreadwordswithoutspaces", model)
    assert segments == expected_words
def test_text_models():
    """Exercise the word models on flatland.txt and on tiny hand-made inputs.

    Covers top(), probability lookup via indexing, the cond_prob mapping,
    and the raw count dictionary of freshly built models.
    """
    corpus_words = words(open_data("EN-text/flatland.txt").read())
    unigram = UnigramWordModel(corpus_words)
    bigram = NgramWordModel(2, corpus_words)
    trigram = NgramWordModel(3, corpus_words)

    # top(5): (count, item) pairs expected from each model.
    expected_unigram_top = [
        (2081, 'the'), (1479, 'of'), (1021, 'and'), (1008, 'to'), (850, 'a'),
    ]
    assert unigram.top(5) == expected_unigram_top

    expected_bigram_top = [
        (368, ('of', 'the')),
        (152, ('to', 'the')),
        (152, ('in', 'the')),
        (86, ('of', 'a')),
        (80, ('it', 'is')),
    ]
    assert bigram.top(5) == expected_bigram_top

    expected_trigram_top = [
        (30, ('a', 'straight', 'line')),
        (19, ('of', 'three', 'dimensions')),
        (16, ('the', 'sense', 'of')),
        (13, ('by', 'the', 'sense')),
        (13, ('as', 'well', 'as')),
    ]
    assert trigram.top(5) == expected_trigram_top

    # Indexing a model yields a value compared here with a relative tolerance.
    assert isclose(unigram['the'], 0.0611, rel_tol=0.001)
    assert isclose(bigram[('of', 'the')], 0.0108, rel_tol=0.01)
    assert isclose(trigram[('so', 'as', 'to')], 0.000323, rel_tol=0.001)

    # cond_prob: an absent key gives None via get(); a present key exposes counts.
    assert bigram.cond_prob.get(('went',)) is None
    assert trigram.cond_prob[('in', 'order')].dictionary == {'to': 6}

    # dictionary contents for minimal inputs.
    single_word_model = UnigramWordModel(words('unigram'))
    assert single_word_model.dictionary == {'unigram': 1}

    two_word_model = NgramWordModel(2, words('bigram text'))
    assert two_word_model.dictionary == {('bigram', 'text'): 1}

    four_word_model = NgramWordModel(3, words('test trigram text here'))
    assert ('test', 'trigram', 'text') in four_word_model.dictionary
    assert ('trigram', 'text', 'here') in four_word_model.dictionary
def test_rot13_decoding():
    """Check that a ShiftDecoder built from flatland.txt undoes rot13."""
    corpus = open_data("EN-text/flatland.txt").read()
    decoder = ShiftDecoder(corpus)

    encoded = rot13('Hello, world!')
    assert decoder.decode(encoded) == 'Hello, world!'
def test_shift_decoding():
    """Check that a ShiftDecoder built from flatland.txt decodes a shifted message."""
    corpus = open_data("EN-text/flatland.txt").read()
    decoder = ShiftDecoder(corpus)

    ciphertext = 'Kyzj zj r jvtivk dvjjrxv.'
    expected_plaintext = 'This is a secret message.'
    assert decoder.decode(ciphertext) == expected_plaintext
def test_parse_csv():
    """Check the first row that parse_csv produces from the contents of iris.csv."""
    iris_text = open_data('iris.csv').read()
    first_row = parse_csv(iris_text)[0]
    assert first_row == [5.1, 3.5, 1.4, 0.2, 'setosa']