def test_1(self):
    """1b-1-basic: simple test case using unigram cost from the corpus

    Each (input, expected) pair is run through submission.segmentWords
    with self.unigramCost and compared against the expected segmentation.
    """
    expectations = (
        ('word', 'word'),
        ('twowords', 'two words'),
        ('andthreewords', 'and three words'),
    )
    for query, expected in expectations:
        self.assertEqual(
            expected, submission.segmentWords(query, self.unigramCost))
def test_0(self):
    """1b-0-basic: simple test case using hand-picked unigram costs.

    The cost function below makes five known words cheap (1.0) and every
    other string expensive (1000.0).
    """
    def unigramCost(x):
        return 1.0 if x in ['and', 'two', 'three', 'word', 'words'] else 1000.0

    expectations = (
        ('', ''),
        ('word', 'word'),
        ('twowords', 'two words'),
        ('andthreewords', 'and three words'),
    )
    for query, expected in expectations:
        self.assertEqual(expected, submission.segmentWords(query, unigramCost))
def test_2(self):
    """1b-2-hidden:

    Runs submission.segmentWords with self.unigramCost on a fixed list of
    inputs. No assertion is made here; the results are discarded.
    """
    queries = (
        # Word seen in corpus
        'pizza',
        # Even long unseen words are preferred to their arbitrary segmentations
        'qqqqq',
        'z' * 100,
        # But 'a' is a word
        'aa',
        # With an apparent crossing point at length 6->7
        'aaaaaa',
        'aaaaaaa',
    )
    for query in queries:
        submission.segmentWords(query, self.unigramCost)
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop.

    Repeatedly prompts with '>> ', reads one line from stdin and dispatches
    on a command word. The loop ends on an empty line or end of input.

    Args:
        unigramCost: callable taking one word; used by 'seg', 'both', 'ug'.
        bigramCost: callable taking (previous word, word); used by 'ins',
            'both' and 'bg'.
        possibleFills: callable taking a word and returning an iterable of
            strings; used by 'ins', 'both' and 'fills'.
        command: when None, the first whitespace-separated token of each
            input line is the command and the remainder is its argument.
            Otherwise every input line is passed whole to this command.
    '''
    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so a line-buffered stdout
        # would hold it back; flush so it is visible before we block.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences as in 1b'),
                ('ins', 'Insert vowels into words as in 2b'),
                ('both', 'Joint segment-and-insert as in 3b'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function, treating input as a single word'),
                ('bg', 'Call bigram cost function on the last two words of the input'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print(' Query (seg):', ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            # The characters 'aeiou' are removed from every word before the
            # query is handed to insertVowels.
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (ins):', ' '.join(ws))
            print('')
            print(' ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (both):', ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # Indexing grams[-2] would raise IndexError and end the
                # session; report the problem and keep the loop alive.
                print('bg needs at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command:', cmd)

        print('')
import grader
import submission

#_realUnigramCost, _realBigramCost = wordsegUtil.makeLanguageModels('toy-will.txt')


def unigramCost(x):
    """Return 1.0 for a small hand-picked vocabulary and 1000.0 otherwise."""
    if x in ['and', 'two', 'three', 'word', 'words', 'there', 'the', 're']:
        return 1.0
    else:
        return 1000.0


def equal(a, b):
    """Raise AssertionError unless a == b.

    Raising a bare string is not a valid exception and would itself fail
    with a TypeError that hides the mismatch, so a real exception carrying
    both values is raised instead.
    """
    if a != b:
        raise AssertionError('bad: expected %r, got %r' % (a, b))


# Smoke checks for submission.segmentWords against the toy cost function.
equal('', submission.segmentWords('', unigramCost))
equal('word', submission.segmentWords('word', unigramCost))
equal('two words', submission.segmentWords('twowords', unigramCost))
equal('and three words', submission.segmentWords('andthreewords', unigramCost))
equal('there', submission.segmentWords('there', unigramCost))
equal('there word', submission.segmentWords('thereword', unigramCost))
equal('garbage', submission.segmentWords('garbage', unigramCost))
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop.

    Repeatedly prompts with '>> ', reads one line from stdin and dispatches
    on a command word. The loop ends on an empty line or end of input.

    Output goes through sys.stdout.write rather than print so the same
    source runs on both Python 2 and Python 3 and emits the same text.

    Args:
        unigramCost: callable taking one word; used by 'seg', 'both', 'ug'.
        bigramCost: callable taking (previous word, word); used by 'ins',
            'both' and 'bg'.
        possibleFills: callable taking a word and returning an iterable of
            strings; used by 'ins', 'both' and 'fills'.
        command: when None, the first whitespace-separated token of each
            input line is the command and the remainder is its argument.
            Otherwise every input line is passed whole to this command.
    '''
    def emit(*fields):
        # Same layout as a print statement: fields joined by single spaces
        # and terminated by a newline.
        sys.stdout.write(' '.join(str(field) for field in fields) + '\n')

    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline; flush so it is visible
        # before we block on stdin.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            cmd = command

        emit('')

        if cmd == 'help':
            emit('Usage: <command> [arg1, arg2, ...]')
            emit('')
            emit('Commands:')
            emit('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences'),
                ('ins', 'Insert vowels into words'),
                ('both', 'Joint segment-and-insert'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function'),
                ('bg', 'Query bigram cost function'),
            ]))
            emit('')
            emit('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            emit(' Query (seg):', ' '.join(parts))
            emit('')
            emit(' ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            # The characters 'aeiou' are removed from every word before the
            # query is handed to insertVowels.
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            emit(' Query (ins):', ' '.join(ws))
            emit('')
            emit(' ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            emit(' Query (both):', ' '.join(parts))
            emit('')
            emit(' ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            emit('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            emit(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # Indexing grams[-2] would raise IndexError and end the
                # session; report the problem and keep the loop alive.
                emit('bg needs at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                emit(bigramCost(prefix, ending))

        else:
            emit('Unrecognized command:', cmd)

        emit('')