Exemple #1
0
 def t_1b_4():
     """Segment every word of every query in QUERIES_SEG.

     Each query is cleaned with ``wordsegUtil.cleanLine`` and split with
     ``wordsegUtil.words``; every resulting piece is passed to
     ``submission.segmentWords`` together with ``unigramCost``.  The
     segmentations are only bound to the local ``pred``.
     """
     for raw_query in QUERIES_SEG:
         cleaned = wordsegUtil.cleanLine(raw_query)
         pred = []
         for piece in wordsegUtil.words(cleaned):
             pred.append(submission.segmentWords(piece, unigramCost))
Exemple #2
0
def calculate(unigramCost, bigramCost):
    """Score every answered post in the FILE_PATH CSV and save the averages.

    For each row whose ``ParentAcceptedAnswerId`` is present, the ``Body`` is
    cleaned with ``wordsegUtil.cleanLine`` and two per-gram averages are
    computed: the mean ``unigramCost`` and the mean ``bigramCost`` (the first
    gram is paired with ``wordsegUtil.SENTENCE_BEGIN``).  The results are
    written to a sibling CSV whose name ends in ``_fluency.csv`` with the
    columns ``unigramCost`` and ``bigramCost``.

    Rows with a missing id, a missing body, or a body that is empty after
    cleaning are skipped instead of raising.

    Args:
        unigramCost: callable taking one gram and returning a number.
        bigramCost: callable taking (previous gram, gram) and returning a
            number.
    """
    csv_name = FILE_PATH
    df = pd.read_csv(csv_name)

    ug = []
    bg = []

    for accepted_id, line in zip(df['ParentAcceptedAnswerId'], df['Body']):
        # pd.isna also copes with non-numeric ids, where np.isnan would raise.
        if pd.isna(accepted_id) or pd.isna(line):
            continue

        # NOTE(review): tuple() iterates over whatever cleanLine returns; if
        # that is a plain string the grams are single characters, not words.
        # Confirm this is intended.
        grams = tuple(wordsegUtil.cleanLine(line))
        if not grams:
            # Nothing left after cleaning: averaging would divide by zero and
            # grams[0] would not exist.
            continue
        count = len(grams)

        # Unigram cost, averaged per gram.
        ug.append(sum(unigramCost(gram) for gram in grams) / count)

        # Bigram cost, averaged per gram; the first gram follows SENTENCE_BEGIN.
        previous = (wordsegUtil.SENTENCE_BEGIN,) + grams[:-1]
        bg.append(
            sum(bigramCost(prev, gram) for prev, gram in zip(previous, grams))
            / count
        )

    # Create the output pandas dataframe.
    output = pd.DataFrame(list(zip(ug, bg)), columns=['unigramCost', 'bigramCost'])

    # Only rewrite a trailing ".csv": str.replace would touch every occurrence
    # in the path and, with no match at all, would return the input path and
    # overwrite the source file.
    if csv_name.endswith('.csv'):
        output_name = csv_name[:-len('.csv')] + '_fluency.csv'
    else:
        output_name = csv_name + '_fluency.csv'
    output.to_csv(output_name)
Exemple #3
0
 def t_3b_5():
     """Run joint segment-and-insert on every query in QUERIES_INS.

     Costs and fills come from ``getRealCosts()``; the unigram and bigram
     costs are combined by ``wordsegUtil.smoothUnigramAndBigram`` with 0.2.
     Each cleaned query is split into words, the characters 'aeiou' are
     removed from every word, and each stripped word is handed to
     ``submission.segmentAndInsert``.  Results are only bound to ``pred``.
     """
     ug_cost, bg_cost, fills = getRealCosts()
     smoothed = wordsegUtil.smoothUnigramAndBigram(ug_cost, bg_cost, 0.2)
     for raw_query in QUERIES_INS:
         cleaned = wordsegUtil.cleanLine(raw_query)
         # Strip all words first, then solve, keeping the two phases separate.
         stripped = []
         for word in wordsegUtil.words(cleaned):
             stripped.append(wordsegUtil.removeAll(word, 'aeiou'))
         pred = []
         for piece in stripped:
             pred.append(submission.segmentAndInsert(piece, smoothed, fills))
Exemple #4
0
 def test_3(self):
     """1b-3-hidden:  hidden test case for all queries in QUERIES_SEG"""
     for raw_query in QUERIES_SEG:
         # Clean the query, then split it into the pieces to be segmented.
         parts = wordsegUtil.words(wordsegUtil.cleanLine(raw_query))

         def segment_all(f):
             # Apply the implementation under comparison to every piece.
             return [f(part, self.unigramCost) for part in parts]

         self.compare_with_solution_or_wait(submission, 'segmentWords', segment_all)
Exemple #5
0
 def t_2b_4():
     """Run vowel insertion on every query in QUERIES_INS.

     Each query is cleaned and split into words, the characters 'aeiou' are
     removed from every word, and the resulting list is passed to
     ``submission.insertVowels`` with ``bigramCost`` and ``possibleFills``.
     The result is only bound to the local ``pred``.
     """
     for raw_query in QUERIES_INS:
         cleaned = wordsegUtil.cleanLine(raw_query)
         ws = []
         for word in wordsegUtil.words(cleaned):
             ws.append(wordsegUtil.removeAll(word, 'aeiou'))
         pred = submission.insertVowels(ws, bigramCost, possibleFills)
Exemple #6
0
 def test_4(self):
   """3b-4-hidden:  hidden test case for all queries in QUERIES_BOTH with bigram costs and possible fills from the corpus"""
   smoothCost = wordsegUtil.smoothUnigramAndBigram(self.unigramCost, self.bigramCost, 0.2)
   for position, raw_query in enumerate(QUERIES_BOTH):
     # NOTE(review): only the query at index 1 is compared; every other
     # entry of QUERIES_BOTH is skipped, despite the "all queries" wording.
     if position == 1:
       cleaned = wordsegUtil.cleanLine(raw_query)
       parts = []
       for word in wordsegUtil.words(cleaned):
         parts.append(wordsegUtil.removeAll(word, 'aeiou'))

       def solve_all(f):
         # Apply the implementation under comparison to every stripped word.
         return [f(part, smoothCost, self.possibleFills) for part in parts]

       self.compare_with_solution_or_wait(submission, 'segmentAndInsert', solve_all)
Exemple #7
0
def parser(file_path,file):
    """Write the cleaned body of selected posts from a CSV to ``file``.

    A row is selected when its ``Id`` equals its ``ParentAcceptedAnswerId``.
    The ``Body`` of each selected row is passed through
    ``wordsegUtil.cleanLine`` and written to ``file`` exactly as returned.

    Args:
        file_path: path of the CSV file to read.
        file: object with a ``write`` method receiving each cleaned body.
    """
    posts = pd.read_csv(file_path)

    for row in range(len(posts)):  # visit every post in file order
        is_selected = posts['Id'][row] == posts['ParentAcceptedAnswerId'][row]
        if not is_selected:
            continue
        # NOTE(review): no separator is written between posts; confirm that
        # cleanLine's output is meant to be concatenated back to back.
        file.write(wordsegUtil.cleanLine(posts['Body'][row]))
Exemple #8
0
 def test_3(self):
     """2b-3-hidden:  hidden test case for all queries in QUERIES_INS"""
     for raw_query in QUERIES_INS:
         cleaned = wordsegUtil.cleanLine(raw_query)
         ws = []
         for word in wordsegUtil.words(cleaned):
             ws.append(wordsegUtil.removeAll(word, 'aeiou'))

         def insert_into_copy(f):
             # Each implementation receives its own deep copy of the words,
             # so it cannot modify the list held here.
             return f(copy.deepcopy(ws), self.bigramCost, self.possibleFills)

         self.compare_with_solution_or_wait(submission, 'insertVowels', insert_into_copy)
Exemple #9
0
def repl(unigramCost, bigramCost):
    '''REPL: read, evaluate, print, loop.

    Reads lines from standard input until an empty line (or end of input).
    The first whitespace-separated token of each line is the command and the
    remainder is its argument text:

    * ``help`` prints the command list.
    * ``ug`` prints the mean ``unigramCost`` over the items of
      ``tuple(wordsegUtil.cleanLine(text))``.
    * ``bg`` prints the mean ``bigramCost`` over the words of the text, the
      first word being paired with ``wordsegUtil.SENTENCE_BEGIN``; at least
      two words are required.

    Args:
        unigramCost: callable taking one gram and returning a number.
        bigramCost: callable taking (previous gram, gram) and returning a
            number.
    '''

    while True:
        sys.stdout.write('>> ')
        # The prompt has no newline, so flush it before blocking on input.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        cmdAndLine = line.split(None, 1)
        cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('ug', 'Query unigram cost function, treating input as a single word'),
                ('bg', 'Call bigram cost function on the last two words of the input'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'ug':
            grams = tuple(wordsegUtil.cleanLine(line))
            if not grams:
                # Nothing to average over; avoid dividing by zero.
                print("Text too short. Enter again.")
                print('')
                continue
            print(sum(unigramCost(gram) for gram in grams) / len(grams))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                print("Text too short. Enter again.")
                print('')
                continue
            # Pair every word with its predecessor; the first word follows
            # SENTENCE_BEGIN.
            previous = (wordsegUtil.SENTENCE_BEGIN,) + grams[:-1]
            total = sum(bigramCost(prev, gram) for prev, gram in zip(previous, grams))
            print(total / len(grams))

        else:
            # Two separate arguments: a parenthesised pair would print the
            # repr of a tuple instead of the message.
            print('Unrecognized command:', cmd)

        print('')
Exemple #10
0
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop.

    Reads lines from standard input until an empty line (or end of input) and
    dispatches each one to a command.

    Args:
        unigramCost: cost callable passed to ``submission.segmentWords`` and
            queried directly by ``ug``.
        bigramCost: cost callable passed to ``submission.insertVowels`` and
            queried directly by ``bg``.
        possibleFills: callable passed to the ``submission`` solvers and
            queried directly by ``fills``.
        command: when given, every input line is treated as the argument of
            this command; when None, the first whitespace-separated token of
            each line is the command and the rest is its argument.
    '''

    while True:
        sys.stdout.write('>> ')
        # The prompt has no newline, so flush it before blocking on input.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            # Fixed-command mode: the whole line is the argument.
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences as in 1b'),
                ('ins', 'Insert vowels into words as in 2b'),
                ('both', 'Joint segment-and-insert as in 3b'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function, treating input as a single word'),
                ('bg', 'Call bigram cost function on the last two words of the input'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print('  Query (seg):', ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (ins):', ' '.join(ws))
            print('')
            print('  ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            print('  Query (both):', ' '.join(parts))
            print('')
            print('  ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # grams[-2] would raise IndexError and kill the session.
                print('bg needs at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command:', cmd)

        print('')
Exemple #11
0
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop.

    Reads lines from standard input until an empty line (or end of input) and
    dispatches each one to a command.  Output goes through
    ``sys.stdout.write`` so the function parses and prints the same text on
    both Python 2 and Python 3.

    Args:
        unigramCost: cost callable passed to ``submission.segmentWords`` and
            queried directly by ``ug``.
        bigramCost: cost callable passed to ``submission.insertVowels`` and
            queried directly by ``bg``.
        possibleFills: callable passed to the ``submission`` solvers and
            queried directly by ``fills``.
        command: when given, every input line is treated as the argument of
            this command; when None, the first whitespace-separated token of
            each line is the command and the rest is its argument.
    '''

    def emit(*items):
        # Items joined by a single space and terminated by a newline, i.e.
        # the text a ``print a, b`` statement writes.
        sys.stdout.write(' '.join('%s' % (item,) for item in items) + '\n')

    while True:
        sys.stdout.write('>> ')
        # The prompt has no newline, so flush it before blocking on input.
        sys.stdout.flush()
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            # Fixed-command mode: the whole line is the argument.
            cmd = command

        emit('')

        if cmd == 'help':
            emit('Usage: <command> [arg1, arg2, ...]')
            emit('')
            emit('Commands:')
            emit('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences'),
                ('ins', 'Insert vowels into words'),
                ('both', 'Joint segment-and-insert'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function'),
                ('bg', 'Query bigram cost function'),
            ]))
            emit('')
            emit('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            emit('  Query (seg):', ' '.join(parts))
            emit('')
            emit('  ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            ws = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            emit('  Query (ins):', ' '.join(ws))
            emit('')
            emit('  ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou') for w in wordsegUtil.words(line)]
            emit('  Query (both):', ' '.join(parts))
            emit('')
            emit('  ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            emit('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            emit(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # grams[-2] would raise IndexError and kill the session.
                emit('bg needs at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                emit(bigramCost(prefix, ending))

        else:
            emit('Unrecognized command:', cmd)

        emit('')