def t_1b_4():
    """Run ``submission.segmentWords`` on every query in ``QUERIES_SEG``.

    Each query is passed through ``wordsegUtil.cleanLine`` and then split
    with ``wordsegUtil.words``; every resulting part is segmented with the
    ``unigramCost`` visible from the enclosing scope. The predictions are
    built but neither returned nor checked here.
    """
    for raw_query in QUERIES_SEG:
        cleaned = wordsegUtil.cleanLine(raw_query)
        pred = []
        for part in wordsegUtil.words(cleaned):
            pred.append(submission.segmentWords(part, unigramCost))
def calculate(unigramCost, bigramCost):
    """Score every post in the CSV at ``FILE_PATH`` and save the averages.

    Rows whose ``ParentAcceptedAnswerId`` is NaN are ignored. For each
    remaining row the ``Body`` text is cleaned with
    ``wordsegUtil.cleanLine`` and two values are recorded:

    * the mean of ``unigramCost(gram)`` over all grams, and
    * the mean of the bigram costs, where the first term pairs
      ``wordsegUtil.SENTENCE_BEGIN`` with the first gram.

    The results are written to a sibling file whose name is the input
    name with ``.csv`` replaced by ``_fluency.csv``.

    Args:
        unigramCost: callable taking one gram and returning its cost.
        bigramCost: callable taking ``(previous, current)`` and returning
            the cost of that transition.
    """
    csv_name = FILE_PATH
    df = pd.read_csv(csv_name)
    ug = []
    bg = []
    for i in range(df.shape[0]):
        accepted_id = df['ParentAcceptedAnswerId'][i]
        if np.isnan(accepted_id):
            continue

        # NOTE(review): grams is whatever iterating cleanLine's result
        # yields (single characters if it returns a str) -- confirm that
        # this granularity is intended.
        grams = tuple(wordsegUtil.cleanLine(df['Body'][i]))
        if not grams:
            # Nothing survived cleaning: both averages would divide by
            # zero and grams[0] would not exist. Skipping the row for
            # both lists keeps the two output columns aligned.
            continue
        count = len(grams)

        # Unigram cost, averaged per gram.
        unigram_total = 0
        for gram in grams:
            unigram_total += unigramCost(gram)
        ug.append(unigram_total / count)

        # Bigram cost, averaged per gram; the chain starts at the
        # sentence-begin marker so it has exactly ``count`` terms.
        bigram_total = bigramCost(wordsegUtil.SENTENCE_BEGIN, grams[0])
        for previous, current in zip(grams, grams[1:]):
            bigram_total += bigramCost(previous, current)
        bg.append(bigram_total / count)

    # Create an output pandas dataframe with one row per scored post.
    output = pd.DataFrame(list(zip(ug, bg)),
                          columns=['unigramCost', 'bigramCost'])
    output_name = csv_name.replace('.csv', '_fluency.csv')
    output.to_csv(output_name)
def t_3b_5():
    """Run ``submission.segmentAndInsert`` on every query in ``QUERIES_INS``.

    Costs and the fill function come from ``getRealCosts()``; the unigram
    and bigram costs are combined by
    ``wordsegUtil.smoothUnigramAndBigram`` with a weight of 0.2. Every
    word of each cleaned query has the characters ``aeiou`` removed before
    it is handed to the submission. Results are computed but not returned.
    """
    unigramCost, bigramCost, possibleFills = getRealCosts()
    smoothCost = wordsegUtil.smoothUnigramAndBigram(
        unigramCost, bigramCost, 0.2)

    for raw_query in QUERIES_INS:
        cleaned = wordsegUtil.cleanLine(raw_query)

        # Build the vowel-free inputs first, then run the submission.
        stripped_parts = []
        for word in wordsegUtil.words(cleaned):
            stripped_parts.append(wordsegUtil.removeAll(word, 'aeiou'))

        pred = []
        for part in stripped_parts:
            pred.append(
                submission.segmentAndInsert(part, smoothCost, possibleFills))
def test_3(self):
    """1b-3-hidden: hidden test case for all queries in QUERIES_SEG"""
    for raw_query in QUERIES_SEG:
        cleaned = wordsegUtil.cleanLine(raw_query)
        parts = wordsegUtil.words(cleaned)

        def segment_all(f):
            # ``f`` is the ``segmentWords`` implementation being compared;
            # apply it to every part with this test's unigram cost.
            return [f(part, self.unigramCost) for part in parts]

        self.compare_with_solution_or_wait(
            submission, 'segmentWords', segment_all)
def t_2b_4():
    """Run ``submission.insertVowels`` on every query in ``QUERIES_INS``.

    Each query is cleaned, split into words, and every word has the
    characters ``aeiou`` removed. The resulting list is passed to the
    submission together with the ``bigramCost`` and ``possibleFills``
    visible from the enclosing scope. The prediction is not returned.
    """
    for raw_query in QUERIES_INS:
        cleaned = wordsegUtil.cleanLine(raw_query)
        ws = []
        for word in wordsegUtil.words(cleaned):
            ws.append(wordsegUtil.removeAll(word, 'aeiou'))
        pred = submission.insertVowels(ws, bigramCost, possibleFills)
def test_4(self):
    """3b-4-hidden: hidden test case for all queries in QUERIES_BOTH with bigram costs and possible fills from the corpus"""
    smoothCost = wordsegUtil.smoothUnigramAndBigram(
        self.unigramCost, self.bigramCost, 0.2)

    for index, raw_query in enumerate(QUERIES_BOTH):
        # NOTE(review): only the query at index 1 is exercised even though
        # the docstring says "all queries" -- confirm this is intentional.
        if index == 1:
            cleaned = wordsegUtil.cleanLine(raw_query)
            parts = [wordsegUtil.removeAll(word, 'aeiou')
                     for word in wordsegUtil.words(cleaned)]

            def segment_and_insert_all(f):
                # ``f`` is the ``segmentAndInsert`` implementation under
                # comparison, applied to each vowel-free part.
                return [f(part, smoothCost, self.possibleFills)
                        for part in parts]

            self.compare_with_solution_or_wait(
                submission, 'segmentAndInsert', segment_and_insert_all)
def parser(file_path, file):
    """Write the cleaned body of every accepted answer to ``file``.

    The CSV at ``file_path`` is loaded with pandas. A row counts as an
    accepted answer when its ``Id`` equals its ``ParentAcceptedAnswerId``;
    for those rows ``Body`` is cleaned with ``wordsegUtil.cleanLine`` and
    written to ``file`` exactly as returned (no separator is added here).

    Args:
        file_path: path of the CSV file to read.
        file: object with a ``write`` method that receives the text.
    """
    posts = pd.read_csv(file_path)
    for row in range(len(posts)):
        # Go through each answer post; skip the ones that are not the
        # accepted answer of their parent.
        is_accepted = posts['Id'][row] == posts['ParentAcceptedAnswerId'][row]
        if not is_accepted:
            continue
        file.write(wordsegUtil.cleanLine(posts['Body'][row]))
def test_3(self):
    """2b-3-hidden: hidden test case for all queries in QUERIES_INS"""
    for raw_query in QUERIES_INS:
        cleaned = wordsegUtil.cleanLine(raw_query)
        ws = []
        for word in wordsegUtil.words(cleaned):
            ws.append(wordsegUtil.removeAll(word, 'aeiou'))

        def insert_vowels(f):
            # Hand the implementation its own deep copy of the word list
            # each time it is invoked.
            return f(copy.deepcopy(ws), self.bigramCost, self.possibleFills)

        self.compare_with_solution_or_wait(
            submission, 'insertVowels', insert_vowels)
def repl(unigramCost, bigramCost):
    '''REPL: read, evaluate, print, loop

    Reads commands from standard input until an empty line (or end of
    input) is seen. The first whitespace-separated token is the command,
    the remainder of the line is its argument text.

    Commands:
        help  print the command list.
        ug    print the mean of ``unigramCost`` over the grams obtained by
              iterating ``wordsegUtil.cleanLine`` of the argument text.
        bg    print the mean bigram cost over ``wordsegUtil.words`` of the
              argument text, starting from ``wordsegUtil.SENTENCE_BEGIN``.

    Args:
        unigramCost: callable taking one gram and returning its cost.
        bigramCost: callable taking ``(previous, current)`` and returning
            the cost of that transition.
    '''
    while True:
        sys.stdout.write('>> ')
        line = sys.stdin.readline().strip()
        if not line:
            break

        cmdAndLine = line.split(None, 1)
        cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('ug', 'Query unigram cost function, treating input as a single word'),
                ('bg', 'Call bigram cost function on the last two words of the input'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'ug':
            grams = tuple(wordsegUtil.cleanLine(line))
            if not grams:
                # No text to score: averaging would divide by zero.
                print("Text too short. Enter again.")
            else:
                cost = 0
                for gram in grams:
                    cost += unigramCost(gram)
                print(cost / len(grams))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                print("Text too short. Enter again.")
            else:
                cost = bigramCost(wordsegUtil.SENTENCE_BEGIN, grams[0])
                for previous, current in zip(grams, grams[1:]):
                    cost += bigramCost(previous, current)
                print(cost / len(grams))

        else:
            # Pass two arguments to print; wrapping them in an extra pair
            # of parentheses would print the repr of a tuple instead.
            print('Unrecognized command:', cmd)

        print('')
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop

    Reads lines from standard input until an empty line (or end of input)
    is seen and dispatches each one to a command.

    Args:
        unigramCost: callable used by ``seg``, ``both`` and ``ug``.
        bigramCost: callable used by ``ins``, ``both`` and ``bg``.
        possibleFills: callable used by ``ins``, ``both`` and ``fills``.
        command: when ``None`` the first token of every input line names
            the command and the rest is its argument; otherwise every
            input line is treated entirely as the argument of this fixed
            command.
    '''
    while True:
        sys.stdout.write('>> ')
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            # Fixed-command mode: the whole line is the argument.
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences as in 1b'),
                ('ins', 'Insert vowels into words as in 2b'),
                ('both', 'Joint segment-and-insert as in 3b'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function, treating input as a single word'),
                ('bg', 'Call bigram cost function on the last two words of the input'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print(' Query (seg):', ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            ws = [wordsegUtil.removeAll(w, 'aeiou')
                  for w in wordsegUtil.words(line)]
            print(' Query (ins):', ' '.join(ws))
            print('')
            print(' ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(
                unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou')
                     for w in wordsegUtil.words(line)]
            print(' Query (both):', ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # grams[-2] would raise IndexError and end the session;
                # report the problem and keep the loop alive instead.
                print('bg needs at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command:', cmd)

        print('')
def repl(unigramCost, bigramCost, possibleFills, command=None):
    '''REPL: read, evaluate, print, loop

    Reads lines from standard input until an empty line (or end of input)
    is seen and dispatches each one to a command.

    Every ``print`` below is given exactly one parenthesised argument, so
    the emitted text is the same whether ``print`` is the Python 2
    statement or the Python 3 function.

    Args:
        unigramCost: callable used by ``seg``, ``both`` and ``ug``.
        bigramCost: callable used by ``ins``, ``both`` and ``bg``.
        possibleFills: callable used by ``ins``, ``both`` and ``fills``.
        command: when ``None`` the first token of every input line names
            the command and the rest is its argument; otherwise every
            input line is treated entirely as the argument of this fixed
            command.
    '''
    while True:
        sys.stdout.write('>> ')
        line = sys.stdin.readline().strip()
        if not line:
            break

        if command is None:
            cmdAndLine = line.split(None, 1)
            cmd, line = cmdAndLine[0], ' '.join(cmdAndLine[1:])
        else:
            # Fixed-command mode: the whole line is the argument.
            cmd = command

        print('')

        if cmd == 'help':
            print('Usage: <command> [arg1, arg2, ...]')
            print('')
            print('Commands:')
            print('\n'.join(a + '\t\t' + b for a, b in [
                ('help', 'This'),
                ('seg', 'Segment character sequences'),
                ('ins', 'Insert vowels into words'),
                ('both', 'Joint segment-and-insert'),
                ('fills', 'Query possibleFills() to see possible vowel-fillings of a word'),
                ('ug', 'Query unigram cost function'),
                ('bg', 'Query bigram cost function'),
            ]))
            print('')
            print('Enter empty line to quit')

        elif cmd == 'seg':
            line = wordsegUtil.cleanLine(line)
            parts = wordsegUtil.words(line)
            print(' Query (seg): ' + ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentWords(part, unigramCost) for part in parts))

        elif cmd == 'ins':
            line = wordsegUtil.cleanLine(line)
            ws = [wordsegUtil.removeAll(w, 'aeiou')
                  for w in wordsegUtil.words(line)]
            print(' Query (ins): ' + ' '.join(ws))
            print('')
            print(' ' + submission.insertVowels(ws, bigramCost, possibleFills))

        elif cmd == 'both':
            line = wordsegUtil.cleanLine(line)
            smoothCost = wordsegUtil.smoothUnigramAndBigram(
                unigramCost, bigramCost, 0.2)
            parts = [wordsegUtil.removeAll(w, 'aeiou')
                     for w in wordsegUtil.words(line)]
            print(' Query (both): ' + ' '.join(parts))
            print('')
            print(' ' + ' '.join(
                submission.segmentAndInsert(part, smoothCost, possibleFills)
                for part in parts
            ))

        elif cmd == 'fills':
            line = wordsegUtil.cleanLine(line)
            print('\n'.join(possibleFills(line)))

        elif cmd == 'ug':
            line = wordsegUtil.cleanLine(line)
            print(unigramCost(line))

        elif cmd == 'bg':
            grams = tuple(wordsegUtil.words(line))
            if len(grams) < 2:
                # grams[-2] would raise IndexError and end the session;
                # report the problem and keep the loop alive instead.
                print('bg needs at least two words')
            else:
                prefix, ending = grams[-2], grams[-1]
                print(bigramCost(prefix, ending))

        else:
            print('Unrecognized command: ' + cmd)

        print('')