Ejemplo n.º 1
0
    def test_0(self):
        """2b-0-basic:  simple test case"""
        # Toy language model: the only cheap bigrams are the adjacent word
        # pairs of the reference sentence, preceded by the sentence-begin
        # marker.  Every other pair is heavily penalised.
        sentence = [wordsegUtil.SENTENCE_BEGIN] + 'beam me up scotty'.split()
        cheapPairs = list(zip(sentence, sentence[1:]))

        def bigramCost(a, b):
            return 1.0 if (a, b) in cheapPairs else 1000.0

        # Candidate vowel-fillings for each vowel-free input word.
        fillTable = {
            'bm': ('beam', 'bam', 'boom'),
            'm': ('me', 'ma'),
            'p': ('up', 'oop', 'pa', 'epe'),
            'sctty': ('scotty',),
        }

        def possibleFills(x):
            # A fresh set on every call; unknown words have no fills.
            return set(fillTable.get(x, ()))

        # (expected output, query words), checked in this order.
        expectations = [
            ('', []),
            ('zz$z$zz', ['zz$z$zz']),  # No fills
            ('beam', ['bm']),
            ('me up', ['m', 'p']),
            ('beam me up scotty', 'bm m p sctty'.split()),
        ]
        for expected, query in expectations:
            self.assertEqual(
                expected,
                submission.insertVowels(query, bigramCost, possibleFills))
Ejemplo n.º 2
0
 def test_1(self):
   """2b-1-hidden:  Simple hidden test case"""
   # The results are only computed in this method; nothing is asserted on
   # them here.
   cost = self.bigramCost
   fills = self.possibleFills

   solution1 = submission.insertVowels([], cost, fills)
   # No fills
   solution2 = submission.insertVowels(['zz$z$zz'], cost, fills)
   solution3 = submission.insertVowels([''], cost, fills)
   solution4 = submission.insertVowels(
     'wld lk t hv mr lttrs'.split(), cost, fills)
   solution5 = submission.insertVowels('ngh lrdy'.split(), cost, fills)
Ejemplo n.º 3
0
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop

    Prompts on standard output, reads one line at a time from standard
    input and prints the result of the requested command.  The loop ends
    on an empty line or at end of input.

    unigramCost   -- callable taking one word; used by 'seg', 'both', 'ug'.
    bigramCost    -- callable taking (previous word, word); used by 'ins',
                     'both' and 'bg'.
    possibleFills -- callable taking one word and returning an iterable of
                     strings; used by 'ins', 'both' and 'fills'.
    command       -- when not None, every input line is treated entirely as
                     the argument of this command instead of being split
                     into "<command> <argument>".
    '''

    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so flush it explicitly;
        # otherwise it may stay in the buffer until after the read.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            # First token is the command, the remainder (possibly empty)
            # is its argument.
            cmdAndLine = line.split(None, 1)
            cmd = cmdAndLine[0]
            line = cmdAndLine[1] if len(cmdAndLine) > 1 else ''
        else:
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences as in 1b'),
                ('ins', 'Insert vowels into words as in 2b'),
                ('both', 'Joint segment-and-insert as in 3b'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function, treating input as a single word'),
                ('bg', 'Call bigram cost function on the last two words of the input'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print('  Query (seg):', ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            # Strip the vowels so the query is what insertVowels expects.
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (ins):', ' '.join(ws))
            print('')
            print('  ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (both):', ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # A bigram needs two words; report it instead of letting an
                # IndexError terminate the whole session.
                print('  bg needs at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command:', cmd)

        print('')
Ejemplo n.º 4
0
    def test_2(self):
        """2b-2-hidden:  Simple hidden test case."""
        SB = wordsegUtil.SENTENCE_BEGIN

        # Check for correct use of SENTENCE_BEGIN
        def sentenceBeginCost(a, b):
            if a == SB and b == 'cat':
                return 5.0
            if a != SB and b == 'dog':
                return 1.0
            return 1000.0

        solution1 = submission.insertVowels(
            ['x'], sentenceBeginCost, lambda x: {'cat', 'dog'})

        # Check for non-greediness
        # Dog over log -- a test poem by rf
        greedyTrapCosts = {
            (SB, 'cat'): 1.0,  # Always start with cat
            ('cat', 'log'): 1.0,  # Locally prefer log
            ('cat', 'dog'): 2.0,  # rather than dog
            ('log', 'mouse'): 3.0,  # But dog would have been
            ('dog', 'mouse'): 1.0,  # better in retrospect
        }

        def greedyTrapCost(a, b):
            return greedyTrapCosts.get((a, b), 1000.0)

        greedyTrapFills = {
            'x1': ('cat', 'dog'),
            'x2': ('log', 'dog', 'frog'),
            'x3': ('mouse', 'house', 'cat'),
        }

        def greedyTrapFillsFor(x):
            # Unknown keys raise KeyError; each call yields a fresh set.
            return set(greedyTrapFills[x])

        solution2 = submission.insertVowels(
            'x1 x2 x3'.split(), greedyTrapCost, greedyTrapFillsFor)

        # Check for non-trivial long-range dependencies
        # Dogs over logs -- another test poem by rf
        chainCosts = {
            (SB, 'cat'): 1.0,  # Always start with cat
            ('cat', 'log1'): 1.0,  # Locally prefer log
            ('cat', 'dog1'): 2.0,  # Rather than dog
            ('log20', 'mouse'): 1.0,  # And this might even
            ('dog20', 'mouse'): 1.0,  # seem to be okay
        }
        for step in range(1, 20):  # But along the way
            nxt = step + 1
            #                               Dog's cost will decay
            chainCosts[('log%d' % step, 'log%d' % nxt)] = 0.25
            chainCosts[('dog%d' % step, 'dog%d' % nxt)] = 1.0 / float(step)
        #                               Hooray

        def chainCost(a, b):
            return chainCosts.get((a, b), 1000.0)

        chainFills = {
            'x0': ('cat', 'dog'),
            'x21': ('mouse', 'house', 'cat'),
        }
        for step in range(1, 21):
            chainFills['x%d' % step] = (
                'log%d' % step, 'dog%d' % step, 'frog')

        def chainFillsFor(x):
            # Unknown keys raise KeyError; each call yields a fresh set.
            return set(chainFills[x])

        solution3 = submission.insertVowels(
            ['x%d' % i for i in range(22)], chainCost, chainFillsFor)
Ejemplo n.º 5
0
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop

    Prompts on standard output, reads one line at a time from standard
    input and prints the result of the requested command.  The loop ends
    on an empty line or at end of input.

    unigramCost   -- callable taking one word; used by 'seg', 'both', 'ug'.
    bigramCost    -- callable taking (previous word, word); used by 'ins',
                     'both' and 'bg'.
    possibleFills -- callable taking one word and returning an iterable of
                     strings; used by 'ins', 'both' and 'fills'.
    command       -- when not None, every input line is treated entirely as
                     the argument of this command instead of being split
                     into "<command> <argument>".

    Every print below passes exactly one parenthesised argument, so the
    output is the same whether print is a statement (Python 2) or a
    function (Python 3).
    '''

    while True:
        sys.stdout.write('>> ')
        # The prompt has no trailing newline, so flush it explicitly;
        # otherwise it may stay in the buffer until after the read.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            # First token is the command, the remainder (possibly empty)
            # is its argument.
            cmdAndLine = line.split(None, 1)
            cmd = cmdAndLine[0]
            line = cmdAndLine[1] if len(cmdAndLine) > 1 else ''
        else:
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences'),
                ('ins', 'Insert vowels into words'),
                ('both', 'Joint segment-and-insert'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function'),
                ('bg', 'Query bigram cost function'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print('  Query (seg): ' + ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            # Strip the vowels so the query is what insertVowels expects.
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (ins): ' + ' '.join(ws))
            print('')
            print('  ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (both): ' + ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # A bigram needs two words; report it instead of letting an
                # IndexError terminate the whole session.
                print('  bg needs at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command: ' + cmd)

        print('')