def test_3(self):
    """3b-3-hidden: hidden test case with hand-picked bigram costs and possible fills

    Runs submission.segmentAndInsert on four queries using a bigram cost that
    is cheap (1.0) only for adjacent pairs of the sentence
    "<begin> beam me up scotty" and expensive (1000.0) for everything else.
    The results are computed but not asserted inside this method.
    """
    sentence = [wordsegUtil.SENTENCE_BEGIN] + 'beam me up scotty'.split()
    # Adjacent word pairs of the reference sentence; built once up front
    # instead of on every cost query.
    cheap_pairs = set(zip(sentence, sentence[1:]))

    def bigramCost(a, b):
        return 1.0 if (a, b) in cheap_pairs else 1000.0

    fill_table = {
        'bm': ('beam', 'bam', 'boom'),
        'm': ('me', 'ma'),
        'p': ('up', 'oop', 'pa', 'epe'),
        'sctty': ('scotty',),
        'z': ('ze',),
    }

    def possibleFills(x):
        # Hand back a fresh set on every call so callers never share state.
        return set(fill_table.get(x, ()))

    # The first query ('zzzzz') checks that no non-word makes it through.
    queries = ('zzzzz', 'bm', 'mp', 'bmmpsctty')
    solutions = [
        submission.segmentAndInsert(query, bigramCost, possibleFills)
        for query in queries
    ]
def test_0(self):
    """3b-0-basic: Simple test case with hand-picked bigram costs and possible fills.

    The bigram cost depends only on the second word: 1.0 for the five known
    words and 1000.0 otherwise. Each query is checked against its expected
    segmentation with vowels restored.
    """
    cheap_words = ('and', 'two', 'three', 'word', 'words')

    def bigramCost(a, b):
        if b in cheap_words:
            return 1.0
        return 1000.0

    fill_table = {
        'nd': {'and'},
        'tw': {'two'},
        'thr': {'three'},
        'wrd': {'word'},
        'wrds': {'words'},
    }

    def fills(x):
        return fill_table.get(x, set())

    # (expected output, vowel-free query), checked in this order.
    cases = [
        ('', ''),
        ('word', 'wrd'),
        ('two words', 'twwrds'),
        ('and three words', 'ndthrwrds'),
    ]
    for expected, query in cases:
        self.assertEqual(
            expected, submission.segmentAndInsert(query, bigramCost, fills))
def test_2(self):
    """3b-2-hidden: hidden test case with unigram costs as bigram costs and additional possible fills.

    The bigram cost ignores the first word and delegates to
    self.unigramCost on the second. Compared with the basic fill table,
    two extra entries ('th' -> 'the', 'rwrds' -> 'rewards') are available.
    The results are computed but not asserted inside this method.
    """
    def bigramCost(a, b):
        return self.unigramCost(b)

    fill_table = {
        'nd': {'and'},
        'tw': {'two'},
        'thr': {'three'},
        'wrd': {'word'},
        'wrds': {'words'},
        # Two additional candidate words beyond the basic table.
        'th': {'the'},
        'rwrds': {'rewards'},
    }

    def fills(x):
        return fill_table.get(x, set())

    # Same call order as always: 'wrd', then 'twwrds', then 'ndthrwrds'.
    queries = ('wrd', 'twwrds', 'ndthrwrds')
    solutions = [
        submission.segmentAndInsert(query, bigramCost, fills)
        for query in queries
    ]
def test_1(self):
    """3b-1-basic: simple test case with unigram costs as bigram costs

    The bigram cost ignores the first word and delegates to
    self.unigramCost on the second. Each query is checked against its
    expected segmentation with vowels restored.
    """
    def bigramCost(a, b):
        return self.unigramCost(b)

    fill_table = {
        'nd': {'and'},
        'tw': {'two'},
        'thr': {'three'},
        'wrd': {'word'},
        'wrds': {'words'},
    }

    def fills(x):
        return fill_table.get(x, set())

    # (expected output, vowel-free query), checked in this order.
    cases = [
        ('word', 'wrd'),
        ('two words', 'twwrds'),
        ('and three words', 'ndthrwrds'),
    ]
    for expected, query in cases:
        self.assertEqual(
            expected, submission.segmentAndInsert(query, bigramCost, fills))
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop.

    Reads lines from sys.stdin until an empty line (or end of input) and
    runs one command per line, printing the result to stdout.

    Args:
        unigramCost: callable taking one word; used by 'seg', 'both', 'ug'.
        bigramCost: callable taking (previous word, word); used by 'ins',
            'both' and 'bg'.
        possibleFills: callable taking a word and returning an iterable of
            strings; used by 'ins', 'both' and 'fills'.
        command: when None, the first whitespace-separated token of each
            line is the command and the rest is its argument; otherwise
            every line is treated entirely as the argument to this command.

    Type 'help' at the prompt for the list of commands.
    '''
    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so a buffered stdout could
        # hold it back until after the read; flush so it is shown first.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            pieces = line.split(None, 1)
            cmd = pieces[0]
            line = pieces[1] if len(pieces) > 1 else ''
        else:
            cmd = command

        print('')

        if cmd == 'help':
            help_rows = [
                ('help', 'This'),
                ('seg', 'Segment character sequences as in 1b'),
                ('ins', 'Insert vowels into words as in 2b'),
                ('both', 'Joint segment-and-insert as in 3b'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function, treating input as a single word'),
                ('bg', 'Call bigram cost function on the last two words of the input'),
            ]
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in help_rows))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print(' Query (seg):', ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (ins):', ' '.join(ws))
            print('')
            print(' ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (both):', ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # Indexing grams[-2] would raise IndexError and end the
                # whole session; report the problem and keep looping.
                print('bg needs at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command:', cmd)

        print('')
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop.

    Reads lines from sys.stdin until an empty line (or end of input) and
    runs one command per line, printing the result to stdout.

    Every print below passes a single, already-formatted argument inside
    parentheses, so the output is the same whether `print` is the
    Python 2 statement or the Python 3 function.

    Args:
        unigramCost: callable taking one word; used by 'seg', 'both', 'ug'.
        bigramCost: callable taking (previous word, word); used by 'ins',
            'both' and 'bg'.
        possibleFills: callable taking a word and returning an iterable of
            strings; used by 'ins', 'both' and 'fills'.
        command: when None, the first whitespace-separated token of each
            line is the command and the rest is its argument; otherwise
            every line is treated entirely as the argument to this command.

    Type 'help' at the prompt for the list of commands.
    '''
    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so a buffered stdout could
        # hold it back until after the read; flush so it is shown first.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            pieces = line.split(None, 1)
            cmd = pieces[0]
            line = pieces[1] if len(pieces) > 1 else ''
        else:
            cmd = command

        print('')

        if cmd == 'help':
            help_rows = [
                ('help', 'This'),
                ('seg', 'Segment character sequences'),
                ('ins', 'Insert vowels into words'),
                ('both', 'Joint segment-and-insert'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function'),
                ('bg', 'Query bigram cost function'),
            ]
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in help_rows))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print(' Query (seg): ' + ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (ins): ' + ' '.join(ws))
            print('')
            print(' ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print(' Query (both): ' + ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # Indexing grams[-2] would raise IndexError and end the
                # whole session; report the problem and keep looping.
                print('bg needs at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command: ' + cmd)

        print('')