def compute_accuracy(self, hmm, evaluation, ignore_tags=None):
    """Compute the mean per-sentence accuracy of HMM tags on a validation set.

    Args:
        hmm: Model handed to ``Viterbi`` to decode every sentence.
        evaluation: Sized collection of ``(sentence, validation_tags)`` pairs.
        ignore_tags: Tags forwarded to ``self.accuracy``. ``None`` (the
            default) means an empty set; a fresh set is created per call so
            no state is shared between calls.

    Returns:
        The average of ``self.accuracy(...)`` over all pairs, or ``0.0`` when
        ``evaluation`` is empty (previously this raised ZeroDivisionError).
    """
    if ignore_tags is None:
        ignore_tags = set()

    total = len(evaluation)
    if total == 0:
        # Nothing to score; avoid building a decoder and dividing by zero.
        return 0.0

    decoder = Viterbi(hmm)
    accuracy = 0.0
    for sentence, validation_tags in evaluation:
        # compute_best_parse returns a sequence whose second item is the tags.
        hmm_tags = decoder.compute_best_parse(sentence)[1]
        accuracy += self.accuracy(hmm_tags, validation_tags, ignore_tags)
    return accuracy / total
def infer_prepare_params(basic_or_complex, fileToInfer):
    """Run inference on ``fileToInfer`` with every saved weight vector of a model.

    For each file in the current directory whose name starts with the model's
    prefix, the weights are loaded with ``np.loadtxt``, a ``Viterbi`` decoder
    is built, and ``infer_aux`` writes results/expected tags to
    ``<fn>_results_<v_file>`` / ``<fn>_expected_<v_file>``. The collected
    accuracies are finally passed to ``infer_aux_results``.

    Args:
        basic_or_complex: ``'basic'`` or ``'complex'``; selects the feature
            builder and the weight-file prefix.
        fileToInfer: Path of the tagged file to run inference on.

    Raises:
        ValueError: If ``basic_or_complex`` is neither ``'basic'`` nor
            ``'complex'``. (This used to be a bare ``assert``, which is
            skipped when Python runs with ``-O``.)
    """
    # Validate first so a bad mode fails fast, before any file is parsed.
    if basic_or_complex == 'basic':
        filePrefix = 'finish_basic_opt_v_'
    elif basic_or_complex == 'complex':
        filePrefix = 'finish_complex_opt_v_'
    else:
        raise ValueError(
            "basic_or_complex must be 'basic' or 'complex', got %r"
            % (basic_or_complex,)
        )

    train_parser = MyParser("../train.wtag")
    seenWordsToTagsDict = train_parser.getSeenWordsToTagsDict()
    if basic_or_complex == 'basic':
        fb = BasicFeatureVectorBuilder(train_parser, 0)
    else:
        fb = ComplexFeatureVectorBuilder(train_parser, False)

    # Flatten the path into a token usable inside output file names.
    fn = str(fileToInfer).replace('.', '').replace('/', '')
    parser = MyParser(fileToInfer)
    splitted = parser.splitted
    mle = MLE(train_parser.getUniqueTags(), splitted, fb)

    prefixed = sorted(
        filename for filename in os.listdir('.')
        if filename.startswith(filePrefix)
    )
    print(prefixed)

    results = []
    for v_file in prefixed:
        v = np.loadtxt(v_file)
        vit = Viterbi(mle, mle.allTags, v, seenWordsToTagsDict)
        # Context managers close both files even if infer_aux raises.
        with open(fn + "_results_" + v_file, 'w') as res_file, \
                open(fn + "_expected_" + v_file, 'w') as exp_file:
            accuracy = infer_aux(exp_file, res_file, v_file, splitted, vit)
        results.append(accuracy)
    infer_aux_results(prefixed, results, fileToInfer, fn)
def __init__(self, language):
    """Create an instance with empty containers and fresh helper objects.

    Args:
        language: Stored unchanged on ``self.language``.
    """
    self.language = language

    # Label and sentence containers all start out empty.
    self.total_labels, self.klasses = [], []
    self.train_sentences, self.test_sentenses = [], []

    # Helpers are built with their no-argument constructors, in this order.
    self.factory = FeatureFactory()
    self.viterbi = Viterbi()
def train():
    """Tag every sentence of ``../comp748.wtag`` and report running accuracy.

    Loads weights from ``opt_v_3.txt``, decodes each sentence with
    ``Viterbi.inference``, appends the predicted and expected tags to
    ``test_wtag748_results.txt`` / ``test_wtag748_expected.txt`` (one sentence
    per line), and prints per-sentence and cumulative accuracy and timing.
    """
    train_parser = MyParser("../train.wtag")
    # NOTE(review): this dictionary is overwritten below by the one built from
    # comp748.wtag, so the training-file dictionary is never used - confirm
    # which one the decoder is supposed to receive.
    seenSentencesToTagsDict = train_parser.getSeenWordsToTagsDict()

    parser = MyParser("../comp748.wtag")
    splitted = parser.splitted
    fb = BasicFeatureVectorBuilder(parser, 0)
    mle = MLE(parser.getUniqueTags(), splitted, fb)
    v = np.loadtxt("opt_v_3.txt")

    # Each item of `splitted` is a sequence of (word, tag) pairs.
    sentences = [[pair[0] for pair in tuples] for tuples in splitted]
    expected_tags = [[pair[1] for pair in tuples] for tuples in splitted]

    seenSentencesToTagsDict = parser.getSeenWordsToTagsDict()
    vit = Viterbi(mle, mle.allTags, v, seenSentencesToTagsDict)

    total_res = 0
    words_count = 0
    total_time = 0
    for idx, (s, expected) in enumerate(zip(sentences, expected_tags)):
        curr_word_len = len(s)
        words_count = words_count + curr_word_len

        # The timed span covers inference plus writing both output files.
        start = time.time()
        tags = vit.inference(s)
        # `with` guarantees the handles are closed even if a write fails.
        with open("test_wtag748_results.txt", 'a') as res_file:
            for item in tags:
                res_file.write("%s " % item)
            res_file.write("\n")
        with open("test_wtag748_expected.txt", 'a') as exp_file:
            for item in expected:
                exp_file.write("%s " % item)
            exp_file.write("\n")
        stop = time.time()

        # Compare tags directly; going through hashes in numpy arrays breaks
        # when the two sequences differ in length.
        current_correct = sum(
            1 for gold, predicted in zip(expected, tags) if gold == predicted
        )

        print("---------------------")
        print("Inference for sentence# ", idx, " took: ", stop - start, " seconds")
        total_time = total_time + (stop - start)
        print("Current sentence accuracy: ", current_correct, " of: ", curr_word_len)
        total_res = total_res + current_correct
        # Guard against leading empty sentences (no words counted yet).
        percent = (100 * total_res) / words_count if words_count else 0.0
        print("Total sentence accuracy: ", total_res, " of: ", words_count, "=", percent, "%")
        print("Total time for ", idx, " sentences: ", (total_time / 60), " minutes")
def load_model(self):
    """Load the serialized model and build the Viterbi decoder from it.

    Reads ``{config.MODEL}/model.dill`` and stores its ``'words'``,
    ``'words_inverse'`` and ``'tree'`` entries, plus a ``Viterbi`` built from
    those three values, on the instance.
    """
    print("Loading model")
    # NOTE(security): dill, like pickle, can execute arbitrary code while
    # loading; only point config.MODEL at trusted files.
    # The context manager closes the file, which the previous inline
    # ``open(...)`` never did.
    with open(f"{config.MODEL}/model.dill", 'rb') as model_file:
        model = dill.load(model_file)

    words = model['words']
    words_inverse = model['words_inverse']
    tree = model['tree']
    viterbi = Viterbi(words, words_inverse, tree)
    print("Ready.")

    self.viterbi = viterbi
    self.words = words
    self.words_inverse = words_inverse
    self.tree = tree
def run_viterbi(self, test_data, training_data, pass_number):
    """Tag ``test_data`` with Viterbi and count the mismatches.

    Args:
        test_data: List of ``(word, tag)`` pairs. Leading items before the
            first ``('SOS', 'SOS')`` pair are removed from this list in place.
        training_data: Data from which ``ProbabilityCounter`` builds the word
            and category probability tables.
        pass_number: Zero-based pass index; only used in the progress message.

    Returns:
        ``(total_observations, wrong_count)``: the number of predicted tags
        and how many of them differ from the true tag at the same position.
    """
    wrong_count = 0
    total_observations = 0

    # Drop everything before the first sentence start. The previous version
    # popped from the list while iterating over it, which ends the loop after
    # roughly half of the items and could leave a partial sentence in front.
    lead = 0
    while lead < len(test_data) and test_data[lead] != ('SOS', 'SOS'):
        print('Removing item: %s' % ' / '.join(test_data[lead]))
        lead += 1
    del test_data[:lead]  # single in-place removal instead of repeated pop(0)
    if test_data:
        print('Found SOS tag, continuing with test data.')

    stripped_test_data = [x[0] for x in test_data]
    print("Test data length: %s" % len(stripped_test_data))

    word_table = ProbabilityCounter().generate_word_pr_table(training_data, 1)
    cat_table = ProbabilityCounter().generate_cat_pr_table(training_data, 1)
    tag_list = list(cat_table.columns.values)

    # Both counters are still zero here; the messages are kept because they
    # are part of the existing output. The accuracy message that followed was
    # guarded by `total_observations > 0` and could never be printed.
    print('Current WrongCount: %s' % wrong_count)
    print('Current Total Tags: %s' % total_observations)

    # TODO True accuracy measure may need to remove the added SOS tags
    print('Running Viterbi algorithm pass number: %d' % (pass_number + 1))
    opt = Viterbi().tagger_updated(stripped_test_data, tag_list, cat_table,
                                   word_table)

    true_tags = [x[1] for x in test_data]
    predicted_tags = [x[1] for x in opt]
    print(true_tags)
    print(predicted_tags)

    # Index into true_tags on purpose: a length mismatch should still raise
    # rather than be silently truncated.
    for j, predicted in enumerate(predicted_tags):
        if predicted != true_tags[j]:
            wrong_count += 1
    total_observations += len(predicted_tags)
    return total_observations, wrong_count
# NOTE(review): this class was recovered from whitespace-collapsed text; the
# statement nesting below is the most plausible reading and should be checked
# against the original file, especially in train() and test().
class Perceptron:
    """Sequence labeller that adjusts per-label ``Klass`` objects on mistakes.

    Sentences are decoded with ``self.viterbi``; training calls
    ``Klass.adjust`` for the guessed and gold labels whenever they differ.
    """

    def __init__(self, language):
        """Create empty label/sentence containers and the helper objects."""
        self.total_labels = []
        self.klasses = []
        self.language = language
        self.train_sentences = []
        self.test_sentenses = []
        self.factory = FeatureFactory()
        self.viterbi = Viterbi()

    def read_data(self, train_file, test_file):
        """Read the training data first, then the testing data."""
        self.read_training_data(train_file)
        self.read_testing_data(test_file)

    def read_training_data(self, train_file):
        """Parse training lines into sentences and collect the label set.

        Each non-blank line is split on whitespace and its first four fields
        build an ``EngInstance``; the fourth field is the label. A blank line
        closes the current sentence. Afterwards the ``Klass`` objects are
        created and ``self.viterbi.train`` is called.
        """
        list_of_training_instances = []
        new_sentence = Sentence()
        for line in train_file:
            split = line.strip().split()
            # NOTE(review): a blank line while the current sentence is still
            # empty falls into the else branch, where split[0] would raise
            # IndexError. A sentence not followed by a blank line is never
            # appended.
            if len(split) == 0 and new_sentence.size() != 0:
                # Sentences containing the -DOCSTART- marker are dropped.
                if '-DOCSTART-' not in new_sentence.full_sentence:
                    self.train_sentences.append(new_sentence)
                new_sentence = Sentence()
            else:
                instance = EngInstance(split[0], split[1], split[2], split[3])
                list_of_training_instances.append(instance)
                new_sentence.add(instance)
                # Labels are recorded in order of first appearance.
                if split[3] not in self.total_labels:
                    self.total_labels.append(split[3])
        print 'total number of training instances',len(list_of_training_instances), \
            'total number of training sentences', len(self.train_sentences)
        self.klasses_init()
        self.viterbi.train(self.total_labels, self.train_sentences)

    def klasses_init(self):
        """Append one ``Klass`` per entry of ``self.total_labels``."""
        for label in self.total_labels:
            self.klasses.append(Klass(label))

    def tag_klass(self, tag):
        """Return the ``Klass`` whose ``tag`` equals ``tag``, or ``None``."""
        for klass in self.klasses:
            if klass.tag == tag:
                return klass
        return None

    def read_testing_data(self, test_file):
        """Parse testing lines into sentences.

        Same line handling as ``read_training_data``, except that labels are
        not added to ``self.total_labels``.
        """
        list_of_testing_instances = []
        new_sentence = Sentence()
        for line in test_file:
            split = line.strip().split()
            if len(split) == 0 and new_sentence.size() != 0:
                if '-DOCSTART-' not in new_sentence.full_sentence:
                    self.test_sentenses.append(new_sentence)
                new_sentence = Sentence()
            else:
                instance = EngInstance(split[0], split[1], split[2], split[3])
                list_of_testing_instances.append(instance)
                new_sentence.add(instance)
        print 'total number of testing instances',len(list_of_testing_instances), \
            'total number of testing sentences', len(self.test_sentenses)

    def computeFeatures(self):
        """Compute features for every training and testing sentence."""
        for sentence in self.train_sentences:
            self.factory.compute_sentence_features_eng(sentence)
        for sentence in self.test_sentenses:
            self.factory.compute_sentence_features_eng(sentence)

    def train(self):
        """Run ten passes over the training sentences, then average weights.

        For every mislabelled instance the guessed class is adjusted with
        ``'-'`` and the gold class with ``'+'``. The error count of each pass
        is printed.
        """
        iteration = 0
        total = len(self.train_sentences)  # not used below
        while iteration < 10:
            error = 0
            for i in range(len(self.train_sentences)):
                sentence = self.train_sentences[i]
                path = self.classify(sentence)
                for index in range(len(sentence.instances)):
                    instance = sentence.instances[index]
                    if path[index] == instance.label:
                        instance.predicted_label = instance.label
                    else:
                        guess = self.tag_klass(path[index])
                        instance.predicted_label = path[index]
                        gold = self.tag_klass(instance.label)
                        error += 1
                        guess.adjust(instance.features, '-')
                        gold.adjust(instance.features, '+')
                # NOTE(review): assumed to run once per sentence - confirm.
                self.factory.features_update(sentence)
                for klass in self.klasses:
                    klass.update()
            iteration += 1
            print 'Iteration %d: number of errors %d' % (iteration, error)
        for klass in self.klasses:
            klass.average_weights()

    def classify(self, sentence):
        """Return the label path ``self.viterbi`` finds for ``sentence``."""
        return self.viterbi.viterbi(sentence, self.klasses)

    def test(self):
        """Label the test sentences and print a confusion table and scores.

        Training sentences are re-labelled first and their features updated.
        Each test instance then updates the TP/FP/FN counters of the gold and
        guessed classes. Output rows use ``&`` separators and end in
        ``\\\\ \\hline`` (presumably for pasting into a LaTeX table), followed by
        per-class precision, recall and F as percentages.
        """
        correct = 0
        wrong = 0
        # Keys are (gold tag, guessed tag) pairs; missing pairs count as 0.
        report_summary = defaultdict(lambda:0)
        for i in range(len(self.train_sentences)):
            sentence = self.train_sentences[i]
            path = self.classify(sentence)
            for index in range(len(sentence.instances)):
                instance = sentence.instances[index]
                instance.predicted_label = path[index]
            self.factory.features_update(sentence)
        for sentence in self.test_sentenses:
            path = self.classify(sentence)
            for index in range(len(sentence.instances)):
                instance = sentence.instances[index]
                guess = self.tag_klass(path[index])
                # NOTE(review): tag_klass returns None for a label that has
                # no Klass, in which case gold.tag below would fail.
                gold = self.tag_klass(instance.label)
                report_summary[(gold.tag, guess.tag)] += 1
                if guess.tag != gold.tag:
                    gold.FN += 1
                    guess.FP += 1
                    wrong += 1
                else:
                    gold.TP += 1
                    # NOTE(review): assumed to be nested in the else branch,
                    # i.e. only correct non-'O' guesses are counted - confirm.
                    if guess.tag != 'O':
                        correct += 1
        # Header row: every label followed by a column separator.
        for label_1 in self.total_labels:
            print label_1, "&",
        print
        # One row per gold label, one cell per guessed label.
        for label_1 in self.total_labels:
            print label_1,
            for label_2 in self.total_labels:
                print "&", report_summary[(label_1, label_2)],
            print "\\\\ \\hline"
        print correct, wrong
        for klass in self.klasses:
            # NOTE(review): the bare excepts below fall back to 0 on any
            # error, not only on division by zero.
            try:
                P = float(klass.TP)/(klass.TP + klass.FP)
            except:
                P = 0
            try:
                R = float(klass.TP)/(klass.TP + klass.FN)
            except:
                R = 0
            try:
                F = 2 * P * R /(P + R) * 100
            except:
                F = 0
            print "%s & %.2f & %.2f & %.2f" % (klass.tag, P * 100, R * 100, F)
d[ ph48 ] = char return d PhoneMapIdxtoPh48 = load_liststateto48() PhoneMap48to39 = load_dict_48to39() PhoneMap39toChr = load_dict_48toChr() xs,IDs_utter = read_test() idNphrase=[] for idx in xrange(0,len(xs)): x = xs[idx] id_utter = IDs_utter[idx][0] y_hat = [0]*len(x) V = Viterbi (x , w , y_class , y_hat , 0) start = time.clock() y_tilde = V.main_Viterbi() end = time.clock() print "Viterbi time :" , end-start print "tilde " , np.dot(w.T , Psi( x , y_tilde )) y_temp = [PhoneMap48to39[ PhoneMapIdxtoPh48[int(ph)]] for ph in y_tilde] smooth_y = [] smooth_y1 = [] smooth_y1.append(y_temp[0]) for i in xrange(1,len(y_temp)-1): if y_temp[i-1] == y_temp[i+1] and \ y_temp[i] != y_temp[i-1]: smooth_y1.append( y_temp[i-1] ) elif y_temp[i] != y_temp[i-1] and \ y_temp[i] != y_temp[i+1]:
d[ph48] = char return d PhoneMapIdxtoPh48 = load_liststateto48() PhoneMap48to39 = load_dict_48to39() PhoneMap39toChr = load_dict_48toChr() xs, IDs_utter = read_test() idNphrase = [] for idx in xrange(0, len(xs)): x = xs[idx] id_utter = IDs_utter[idx][0] y_hat = [0] * len(x) V = Viterbi(x, w, y_class, y_hat, 0) start = time.clock() y_tilde = V.main_Viterbi() end = time.clock() print "Viterbi time :", end - start print "tilde ", np.dot(w.T, Psi(x, y_tilde)) y_temp = [PhoneMap48to39[PhoneMapIdxtoPh48[int(ph)]] for ph in y_tilde] smooth_y = [] smooth_y1 = [] smooth_y1.append(y_temp[0]) for i in xrange(1, len(y_temp) - 1): if y_temp[i-1] == y_temp[i+1] and \ y_temp[i] != y_temp[i-1]: smooth_y1.append(y_temp[i - 1]) elif y_temp[i] != y_temp[i-1] and \ y_temp[i] != y_temp[i+1]:
'HOT': { 'HOT': 0.7, 'COLD': 0.3 }, 'COLD': { 'HOT': 0.4, 'COLD': 0.6 } } emission = { 'HOT': { '1': 0.2, '2': 0.4, '3': 0.4 }, 'COLD': { '1': 0.5, '2': 0.4, '3': 0.1 } } processor = Viterbi(sequence, states, initial, transition, emission) result = processor.process() resultString = '' for r in result: if r == 'HOT': resultString += 'H' if r == 'COLD': resultString += 'C' print "The Weather forecast for Observation : ", sequence, "is", resultString
# Normalise the gold standard first, then the test morphemes.
standard = preprocess(standard)
test_morphemes = preprocess(test_morphemes)

# Reduce both to what the comparison needs: the bare test morphemes and the
# morpheme-tag pairs of the gold standard.
test_morphemes, standard = get_morphemes_and_standard(
    test_morphemes, standard)

# Join all morphemes with spaces, cut the text at every EOS marker and keep
# the non-empty pieces, trimmed.
test_sentences = []
for sent in ' '.join(test_morphemes).split(EOS):
    if sent:
        test_sentences.append(sent.strip())

# Tag sentence by sentence, pairing each morpheme with the tag produced for it.
v = Viterbi()
test_output = []
for sentence in test_sentences:
    s_morphemes = tuple(sentence.split(' '))
    tagged = v.tag(sentence).split(' ')
    test_output += zip(s_morphemes, tagged)

# Accumulators for the comparison that follows.
test_report = []
errors = possible = 0

# iterate through gold standard (a list containing sublists
# of variable lengths corresponding to lines in the original
# Re-tag infrequent words of the test corpus (capitalised words, "-ing"
# words, numbers) before evaluation.
testing_infrequent_words = dataPreProcessor.identify_infrequent_words_in_testing_corpus()
testSet = dataPreProcessor.tag_capital_words(testing_infrequent_words, testSet)
testSet = dataPreProcessor.tag_UNI_ing_words(testing_infrequent_words, testSet)
testSet = dataPreProcessor.tag_numbers(testing_infrequent_words, testSet)

# Create the HMM and generate its parameters.
# NOTE(review): the model is built from testSet here, although the original
# comment spoke of the training set - confirm this is intended.
hiddenMarkovModel = HiddenMarkovModel(testSet)
hiddenMarkovModel.calculate_transition_prob_for_POS_tags()
hiddenMarkovModel.calculate_emission_prob()

unified_test_set = [tup for sent in testSet for tup in sent]
test_set_tags = [t for (_, t) in unified_test_set]

viterbi = Viterbi(hiddenMarkovModel)
viterbi_tags = []
# Gold tags of the sentences that were actually decoded. Sentences with 100
# or more words are skipped, so zipping the predictions against the full gold
# list would shift every later tag and corrupt the count.
evaluated_tags = []
for test in testSet:
    if len(test) < 100:
        test_observations = [w for (w, _) in test]
        viterbi_tags += viterbi.tag_words(test_observations)
        evaluated_tags += [t for (_, t) in test]

check = [
    v_tag for v_tag, t_tag in zip(viterbi_tags, evaluated_tags)
    if v_tag == t_tag
]
# The denominator stays the full test set, so skipped sentences count as
# incorrect; an empty test set yields 0.0 instead of ZeroDivisionError.
viterbi_accuracy = len(check) / len(test_set_tags) if test_set_tags else 0.0
print("Initial tags", len(test_set_tags))
print("Correct tags", len(check))
print("Percentage", viterbi_accuracy * 100)
# Divide testing and training corpus
trainSetSize = 10000
testingSetSize = 500
sentences = brown.tagged_sents(tagset='universal')
trainSet = sentences[0:trainSetSize]
# Continue from where the training set stopped.
testSet = sentences[trainSetSize:trainSetSize + testingSetSize]

# Create an instance of the HMM and pass it the training set to generate its
# parameters.
hiddenMarkovModel = HiddenMarkovModel(trainSet)
hiddenMarkovModel.calculate_transition_prob_for_POS_tags()
hiddenMarkovModel.calculate_emission_prob()

unified_test_set = [tup for sent in testSet for tup in sent]
test_set_tags = [t for (_, t) in unified_test_set]

viterbi = Viterbi(hiddenMarkovModel)
viterbi_tags = []
# Gold tags of the sentences that were actually decoded. Sentences with 100
# or more words are skipped, so zipping the predictions against the full gold
# list would shift every later tag and corrupt the count.
evaluated_tags = []
for test in testSet:
    if len(test) < 100:
        test_observations = [w for (w, _) in test]
        viterbi_tags += viterbi.tag_words(test_observations)
        evaluated_tags += [t for (_, t) in test]

check = [
    v_tag for v_tag, t_tag in zip(viterbi_tags, evaluated_tags)
    if v_tag == t_tag
]
# The denominator stays the full test set, so skipped sentences count as
# incorrect; an empty test set yields 0.0 instead of ZeroDivisionError.
viterbi_accuracy = len(check) / len(test_set_tags) if test_set_tags else 0.0
print("Correct tags", len(check))
print("Accuracy", viterbi_accuracy * 100)
print("Final Probability", viterbi.get_final_prob())
print(test_set_tags)
points = simulation.generatePointsFromStates(simulationStates, 5) diff = simulation.generateDiffFromPoints(points['x'], points['y']) f, axarr = plt.subplots(3) axarr[0].bar(range(0, 100), points['x'], color='blue') axarr[1].bar(range(0, 100), points['y'], color='red') axarr[2].bar(range(0, 100), diff, color='green') plt.show() observations = [] for i,j in zip(points['x'], points['y']): observations.append((i, j)) viterbi = Viterbi(observations, { 0: {0: .9, 1: .05, 2: .05}, 1: {0: .05, 1: .9, 2: .05}, 2: {0: .05, 1: .05, 2: .9} }, (0, 1, 2), 2) viterbiStates = viterbi.viterbi() print viterbiStates count = 0 for i,j in zip(simulationStates, viterbiStates): if i == j: count += 1 print count/float(len(simulationStates))
formattedTestFile.write("\n") else: if line == "+": string = "+ " elif "+" in line: morphemeList = line.split("+") string = " ".join(morpheme for morpheme in morphemeList) + " " else: string = line + " " formattedTestFile.write(string) testFile.close() formattedTestFile.close() # Import and run Viterbi algorithm, create list of morphemes in test file from Viterbi import Viterbi v = Viterbi() formattedTestFile = open("korean-testing-formatted.txt","r") viterbiOutputFile = open("viterbi-out.txt","w+") testMorphemeList = [] for line in formattedTestFile.readlines(): line = line.rstrip() string = v.tag(line) viterbiOutputFile.write(string) morphemes = line.split(" ") for morpheme in morphemes: testMorphemeList.append(morpheme) formattedTestFile.close() viterbiOutputFile.close() # Create list of tags in Viterbi test output
# Normalise the gold standard first, then the test morphemes.
standard = preprocess(standard)
test_morphemes = preprocess(test_morphemes)

# Reduce both to what the comparison needs: the bare test morphemes and the
# morpheme-tag pairs of the gold standard.
test_morphemes, standard = get_morphemes_and_standard(
    test_morphemes, standard)

# Join all morphemes with spaces, cut the text at every EOS marker and keep
# the non-empty pieces, trimmed.
test_sentences = []
for sent in ' '.join(test_morphemes).split(EOS):
    if sent:
        test_sentences.append(sent.strip())

# Tag sentence by sentence, pairing each morpheme with the tag produced for it.
v = Viterbi()
test_output = []
for sentence in test_sentences:
    s_morphemes = tuple(sentence.split(' '))
    tagged = v.tag(sentence).split(' ')
    test_output += zip(s_morphemes, tagged)

# Accumulators for the comparison that follows.
test_report = []
errors = possible = 0

# iterate through gold standard (a list containing sublists
# of variable lengths corresponding to lines in the original
import sys

from Viterbi import Viterbi

# Command-line entry point: translate every line of the input file with
# Viterbi.translate and write one translation per line to the output file.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        # Say what went wrong instead of exiting silently, and use sys.exit:
        # the bare `exit` helper is installed by the `site` module and is not
        # guaranteed to exist.
        sys.stderr.write('usage: %s INPUT_FILE OUTPUT_FILE\n' % sys.argv[0])
        sys.exit(1)

    inputName = sys.argv[1]
    outputName = sys.argv[2]
    viterbi = Viterbi()

    # Read everything before the output is opened, so input and output may
    # name the same file.
    # NOTE(review): the input is read with the platform default encoding while
    # the output is forced to UTF-8 - confirm the input encoding.
    with open(inputName, 'r') as inputFile:
        sentenceList = inputFile.readlines()

    with open(outputName, 'w', encoding='utf-8') as outputFile:
        for sentence in sentenceList:
            translation = viterbi.translate(sentence.strip())
            outputFile.write(translation)
            outputFile.write('\n')