words = line.rstrip("\n") if words != " " and words not in data_set: data_set.append(words) return data_set def check(filename, word): dict = {} with open(filename) as file: data = file.readlines() for line in data: words = line.rstrip("\n").split(" ") if word == words[1]: dict[words[0]] = float(words[2]) return dict def argmax(alist): for key in alist.keys(): if alist[key] == min(alist.values()): return key testing = "POS/Part 5/test.in" emission_count = "POS/Part 5/emission_train_count.txt" state_count = "POS/Part 5/emission_count.txt" filetest = testing_splitter(testing, mode="unique") bjos, predicts = gen_bjos(state_count, filetest, emission_count) output_to_file(bjos, "POS/Part 5/emission_testing.txt") tagger(predicts, "POS/Part 5/emission_testing_tags.txt")
data_set = [] with open(filename) as file: data = file.readlines() for line in data: words = line.strip() if words != " " and words not in data_set: data_set.append(words) return data_set def check(filename, word): dict = {} with open(filename) as file: data = file.readlines() for line in data: words = line.strip().split(" ") if word == words[1]: dict[words[0]] = float(words[2]) return dict def argmax(alist): for key in alist.keys(): if alist[key] == min(alist.values()): return key testing = "../dev.in" emission_count = "../Part 3/emission_train_count.txt" state_count = "../Part 3/emission_count.txt" filetest = testing_splitter(testing, mode="unique") bjos, predicts = gen_bjos(state_count, filetest, emission_count) output_to_file(bjos, "part5_emission_testing.txt")
# NOTE(review): this chunk began mid-function — only the innermost increment
# (dict[words[1]][words[0]] += 1) and the return of count_word are visible.
# The header, file reading, and nested-dict initialisation were reconstructed
# from the usage below (count_word(training) feeding gen_bjo and
# count_all_states) and MUST be confirmed against the original source.
def count_word(filename):
    """Read a training file (assumed "word tag" per line — TODO confirm) and
    return nested counts {tag: {word: occurrences}}."""
    counts = {}  # renamed from `dict` to stop shadowing the builtin
    with open(filename) as file:
        for line in file:
            words = line.rstrip("\n").split(" ")
            # Blank/malformed lines would raise IndexError below; skipping is
            # an assumption of the reconstruction — TODO confirm policy.
            if len(words) < 2 or words[0] == "":
                continue
            counts.setdefault(words[1], {}).setdefault(words[0], 0)
            counts[words[1]][words[0]] += 1
    return counts


def count_all_states(word_count):
    """Total the per-tag word counts: {tag: sum of that tag's word counts}."""
    states = {}
    for key in word_count:
        states[key] = sum(word_count[key].values())
    return states


# Computes the bjos — emission probabilities b_j(o) = count(tag->word)/count(tag)
def gen_bjo(word_count):
    """Convert raw counts {tag: {word: count}} into emission probabilities
    {tag: {word: count / total-count-for-tag}}."""
    probs = {}  # renamed from `dict` to stop shadowing the builtin
    state_count = count_all_states(word_count)
    for key in word_count:
        probs[key] = {}
        for words in word_count[key]:
            # `* 1.0` kept from the original so the division stays float even
            # under Python 2 semantics.
            probs[key][words] = word_count[key][words] * 1.0 / state_count[key]
    return probs


# Script entry point, guarded so importing this module does not immediately
# read the corpus or call output_to_file (defined elsewhere in the project).
if __name__ == "__main__":
    training = "POS/train"
    word_count = count_word(training)
    bjo = gen_bjo(word_count)

    # Outputs readable format
    output_to_file(bjo, "POS/Part 5/emission_trainingReadable.txt", mode="readable")
    # Outputs normal format (Emission Probability)
    output_to_file(bjo, "POS/Part 5/emission_training.txt")
    # Outputs normal format (Word count for each Tag).  The original re-read
    # and re-counted the whole training file here; reuse the counts computed
    # above — count_word is a deterministic read, so the output is identical.
    output_to_file(word_count, "POS/Part 5/emission_train_count.txt")
    output_to_file(count_all_states(word_count), "POS/Part 5/emission_count.txt", mode="state")
data = file.readlines() for line in data: words = line.rstrip("\n") if words != " " and words not in data_set: data_set.append(words) return data_set def check(filename, word): dict = {} with open(filename) as file: data = file.readlines() for line in data: words = line.rstrip("\n").split(" ") if word == words[1]: dict[words[0]] = float(words[2]) return dict def argmax(alist): for key in alist.keys(): if alist[key] == min(alist.values()): return key testing = "POS/Part 5/test.in" emission_count = "POS/Part 5/emission_train_count.txt" state_count = "POS/Part 5/emission_count.txt" filetest = testing_splitter(testing, mode="unique") bjos, predicts = gen_bjos(state_count, filetest, emission_count) output_to_file(bjos, "POS/Part 5/emission_testing.txt") tagger(predicts, "POS/Part 5/emission_testing_tags.txt")
for line in data: words = line.strip() if words != " " and words not in data_set: data_set.append(words) return data_set def check(filename, word): dict = {} with open(filename) as file: data = file.readlines() for line in data: words = line.strip().split(" ") if word == words[1]: dict[words[0]] = float(words[2]) return dict def argmax(alist): for key in alist.keys(): if alist[key] == min(alist.values()): return key testing = "../dev.in" emission_count = "../Part 3/emission_train_count.txt" state_count = "../Part 3/emission_count.txt" filetest = testing_splitter(testing, mode="unique") bjos, predicts = gen_bjos(state_count, filetest, emission_count) output_to_file(bjos, "part5_emission_testing.txt")