예제 #1
0
 def compute_accuracy(self, hmm, evaluation, ignore_tags=None):
     """
     Compute the mean per-sentence accuracy of hmm tags on a
     validation collection.

     Each item of `evaluation` is unpacked as a
     (sentence, validation_tags) pair. The sentence is decoded with
     Viterbi(hmm).compute_best_parse, the tags found at index 1 of its
     result are scored with self.accuracy, and the scores are averaged
     over the number of items.

     ignore_tags is forwarded unchanged to self.accuracy; an empty set
     is used when it is omitted.

     Returns 0.0 when `evaluation` is empty instead of dividing by zero.
     """
     # A fresh set per call avoids sharing one default object between calls.
     if ignore_tags is None:
         ignore_tags = set()
     if len(evaluation) == 0:
         return 0.0
     V = Viterbi(hmm)
     accuracy = 0.0
     for sentence, validation_tags in evaluation:
         hmm_tags = V.compute_best_parse(sentence)[1]
         accuracy += self.accuracy(hmm_tags, validation_tags, ignore_tags)
     return accuracy / len(evaluation)
예제 #2
0
def infer_prepare_params(basic_or_complex, fileToInfer):
    """Run inference on `fileToInfer` once per saved weight-vector file.

    The training corpus ("../train.wtag") supplies the seen-words
    dictionary, the tag set and the feature builder. Every file in the
    current directory whose name starts with the prefix for the chosen
    feature set is loaded with np.loadtxt and used for one inference pass
    through infer_aux, which writes to "<fn>_results_<v_file>" and
    "<fn>_expected_<v_file>". The collected accuracies are handed to
    infer_aux_results.

    Args:
        basic_or_complex: 'basic' or 'complex'; selects the feature builder
            and the weight-file prefix.
        fileToInfer: path of the tagged file to run inference on.

    Raises:
        ValueError: if `basic_or_complex` is neither 'basic' nor 'complex'.
    """
    # Validate up front with a real exception: an assert disappears under
    # `python -O`, and failing before the corpora are parsed is cheaper.
    if basic_or_complex not in ('basic', 'complex'):
        raise ValueError(
            "basic_or_complex must be 'basic' or 'complex', got %r"
            % (basic_or_complex,))

    train_parser = MyParser("../train.wtag")
    seenWordsToTagsDict = train_parser.getSeenWordsToTagsDict()
    if basic_or_complex == 'basic':
        fb = BasicFeatureVectorBuilder(train_parser, 0)
        filePrefix = 'finish_basic_opt_v_'
    else:
        fb = ComplexFeatureVectorBuilder(train_parser, False)
        filePrefix = 'finish_complex_opt_v_'

    # Flatten the path into a token that is safe to embed in file names.
    fn = str(fileToInfer).replace('.', '').replace('/', '')
    parser = MyParser(fileToInfer)
    splitted = parser.splitted
    mle = MLE(train_parser.getUniqueTags(), splitted, fb)

    prefixed = sorted(
        filename for filename in os.listdir('.')
        if filename.startswith(filePrefix)
    )
    print(prefixed)
    results = []

    for v_file in prefixed:
        v = np.loadtxt(v_file)
        vit = Viterbi(mle, mle.allTags, v, seenWordsToTagsDict)
        # `with` closes both output files even if infer_aux raises.
        with open(fn + "_results_" + v_file, 'w') as res_file, \
                open(fn + "_expected_" + v_file, 'w') as exp_file:
            accuracy = infer_aux(exp_file, res_file, v_file, splitted, vit)
        results.append(accuracy)
    infer_aux_results(prefixed, results, fileToInfer, fn)
예제 #3
0
 def __init__(self, language):
     """Store `language` and start with empty label, klass and sentence
     lists plus a fresh FeatureFactory and Viterbi instance."""
     self.language = language

     # Containers that start out empty.
     self.total_labels, self.klasses = [], []
     self.train_sentences, self.test_sentenses = [], []

     # Helper objects.
     self.factory = FeatureFactory()
     self.viterbi = Viterbi()
예제 #4
0
def train():
    """Tag every sentence of ../comp748.wtag and log running accuracy.

    Loads the weight vector from opt_v_3.txt, runs Viterbi inference
    sentence by sentence, appends the predicted and the expected tag
    sequences to test_wtag748_results.txt / test_wtag748_expected.txt
    (one line per sentence), and prints per-sentence and cumulative
    accuracy and timing. Returns nothing.
    """
    train_parser = MyParser("../train.wtag")
    # NOTE(review): this dictionary is replaced a few lines below by the one
    # built from comp748.wtag, so the training-corpus dictionary is never
    # used -- confirm which of the two is intended.
    seenSentencesToTagsDict = train_parser.getSeenWordsToTagsDict()
    parser = MyParser("../comp748.wtag")
    splitted = parser.splitted
    fb = BasicFeatureVectorBuilder(parser, 0)
    mle = MLE(parser.getUniqueTags(), splitted, fb)
    v = np.loadtxt("opt_v_3.txt")
    # Each element of `splitted` is a sequence of (word, tag) pairs.
    sentences = [[t[0] for t in tuples] for tuples in splitted]
    expected_tags = [[t[1] for t in tuples] for tuples in splitted]
    seenSentencesToTagsDict = parser.getSeenWordsToTagsDict()
    vit = Viterbi(mle, mle.allTags, v, seenSentencesToTagsDict)
    total_res = 0
    words_count = 0
    total_time = 0
    for idx, (s, expected) in enumerate(zip(sentences, expected_tags)):
        curr_word_len = len(s)
        words_count += curr_word_len
        start = time.time()
        tags = vit.inference(s)

        # Files are re-opened in append mode per sentence so partial output
        # survives an interrupted run; `with` closes them if a write fails.
        with open("test_wtag748_results.txt", 'a') as res_file:
            res_file.write("".join("%s " % item for item in tags))
            res_file.write("\n")

        with open("test_wtag748_expected.txt", 'a') as exp_file:
            exp_file.write("".join("%s " % item for item in expected))
            exp_file.write("\n")

        # The measured time includes the two file writes above.
        stop = time.time()
        # Tags are compared through their hashes so numpy can do an
        # element-wise equality test on integer arrays.
        e = np.array([hash(x) for x in expected])
        t = np.array([hash(x) for x in tags])
        current_correct = np.sum(e == t)
        print("---------------------")
        print("Inference for sentence# ", idx, " took: ", stop - start, " seconds")
        total_time += stop - start
        print("Current sentence accuracy: ", current_correct, " of: ", curr_word_len)
        total_res += current_correct
        print("Total sentence accuracy: ", total_res, " of: ", words_count, "=", (100*total_res)/words_count, "%")
        print("Total time for ", idx, " sentences: ", (total_time / 60), " minutes")
예제 #5
0
 def load_model(self):
   """Load {config.MODEL}/model.dill and build the Viterbi decoder from it.

   The loaded object is indexed with the keys 'words', 'words_inverse'
   and 'tree'; those values and the Viterbi instance built from them are
   stored on self under the same names (plus self.viterbi).
   """
   print("Loading model")
   # NOTE(review): dill builds on pickle, so loading can execute arbitrary
   # code -- only load model files from a trusted source.
   # `with` closes the file handle, which the previous inline open() leaked.
   with open(f"{config.MODEL}/model.dill", 'rb') as model_file:
     model = dill.load(model_file)
   words = model['words']
   words_inverse = model['words_inverse']
   tree = model['tree']
   viterbi = Viterbi(words, words_inverse, tree)
   print("Ready.")
   self.viterbi = viterbi
   self.words = words
   self.words_inverse = words_inverse
   self.tree = tree
예제 #6
0
    def run_viterbi(self, test_data, training_data, pass_number):
        """Tag `test_data` with Viterbi and count the mismatches.

        Leading items of `test_data` are removed in place until the first
        ('SOS', 'SOS') item; if there is none, every item is removed. Word
        and category probability tables are built from `training_data`, the
        first element of each remaining item is tagged, and the predicted
        tags are compared position by position with the second element of
        each item.

        Args:
            test_data: mutable list of (word, tag) items; modified in place.
            training_data: passed to ProbabilityCounter to build the tables.
            pass_number: zero-based pass index, used only in the log output.

        Returns:
            (total_observations, wrong_count): the number of predicted tags
            and how many of them differ from the true tag.
        """
        wrong_count = 0
        total_observations = 0

        # Drop tags in the split test data until the start of a new sentence.
        # The prefix is located first and removed with one slice deletion:
        # popping inside `for j in test_data` advances the iterator over a
        # shrinking list, so it stopped early and could leave stray items.
        sos_marker = ('SOS', 'SOS')
        skip = 0
        while skip < len(test_data) and test_data[skip] != sos_marker:
            print('Removing item: %s' % ' / '.join(test_data[skip]))
            skip += 1
        del test_data[:skip]
        if test_data:
            print('Found SOS tag, continuing with test data.')

        stripped_test_data = [x[0] for x in test_data]

        print("Test data length: %s" % len(stripped_test_data))

        word_table = ProbabilityCounter().generate_word_pr_table(
            training_data, 1)
        cat_table = ProbabilityCounter().generate_cat_pr_table(
            training_data, 1)
        tag_list = list(cat_table.columns.values)

        # Both counters are still zero here; the lines are kept so the log
        # output is unchanged. (The accuracy print that used to follow could
        # never run because total_observations was always 0 at this point.)
        print('Current WrongCount: %s' % wrong_count)
        print('Current Total Tags: %s' % total_observations)

        print('Running Viterbi algorithm pass number: %d' % (pass_number + 1))
        opt = Viterbi().tagger_updated(stripped_test_data, tag_list, cat_table,
                                       word_table)

        true_tags = [x[1] for x in test_data]
        predicted_tags = [x[1] for x in opt]

        print(true_tags)
        print(predicted_tags)

        # TODO True accuracy measure may need to remove the added SOS tags
        for position, predicted in enumerate(predicted_tags):
            if predicted != true_tags[position]:
                wrong_count += 1
        total_observations += len(predicted_tags)

        return total_observations, wrong_count
예제 #7
0
class Perceptron:
    """Sequence tagger that keeps one Klass per label and decodes with Viterbi.

    Input data is whitespace-separated with at least four columns per
    line; a blank line ends a sentence. The fourth column of the training
    data is collected into total_labels.
    """

    def __init__(self, language):
        """Store `language` and create empty containers and helper objects."""
        self.total_labels = []
        self.klasses = []
        self.language = language
        self.train_sentences = []
        self.test_sentenses = []
        self.factory = FeatureFactory()
        self.viterbi = Viterbi()

    def read_data(self, train_file, test_file):
        """Read the training data first, then the testing data."""
        self.read_training_data(train_file)
        self.read_testing_data(test_file)

    def _read_sentences(self, data_file, sentences, collect_labels):
        """Parse `data_file` into Sentence objects appended to `sentences`.

        Sentences whose full_sentence contains '-DOCSTART-' are discarded.
        When `collect_labels` is true, every new fourth-column value is
        appended to self.total_labels. Returns the number of instances read
        (instances of discarded sentences are counted too).
        """
        instance_count = 0
        current = Sentence()
        for line in data_file:
            fields = line.strip().split()
            if not fields:
                # Blank line: close the current sentence. Repeated or leading
                # blank lines are skipped instead of being parsed as data,
                # which used to raise IndexError on fields[0].
                if current.size() != 0:
                    if '-DOCSTART-' not in current.full_sentence:
                        sentences.append(current)
                    current = Sentence()
                continue
            instance = EngInstance(fields[0], fields[1], fields[2], fields[3])
            instance_count += 1
            current.add(instance)
            if collect_labels and fields[3] not in self.total_labels:
                self.total_labels.append(fields[3])
        # Keep the final sentence when the file does not end with a blank
        # line; it was previously dropped silently.
        if current.size() != 0 and '-DOCSTART-' not in current.full_sentence:
            sentences.append(current)
        return instance_count

    def read_training_data(self, train_file):
        """Load training sentences, create the klasses and train Viterbi."""
        instance_count = self._read_sentences(
            train_file, self.train_sentences, True)

        print('total number of training instances %d '
              'total number of training sentences %d'
              % (instance_count, len(self.train_sentences)))

        self.klasses_init()
        self.viterbi.train(self.total_labels, self.train_sentences)

    def klasses_init(self):
        """Append one Klass per entry of total_labels to self.klasses."""
        for label in self.total_labels:
            self.klasses.append(Klass(label))

    def tag_klass(self, tag):
        """Return the first klass whose `tag` equals `tag`, or None."""
        for klass in self.klasses:
            if klass.tag == tag:
                return klass
        return None

    def read_testing_data(self, test_file):
        """Load testing sentences; labels are not collected from test data."""
        instance_count = self._read_sentences(
            test_file, self.test_sentenses, False)

        print('total number of testing instances %d '
              'total number of testing sentences %d'
              % (instance_count, len(self.test_sentenses)))

    def computeFeatures(self):
        """Compute features for every training and testing sentence."""
        for sentence in self.train_sentences:
            self.factory.compute_sentence_features_eng(sentence)
        for sentence in self.test_sentenses:
            self.factory.compute_sentence_features_eng(sentence)

    def train(self, iterations=10):
        """Run `iterations` passes over the training sentences.

        Each sentence is decoded with classify(); for every instance whose
        predicted label differs from its gold label the guessed klass is
        adjusted with '-' and the gold klass with '+'. After each sentence
        the features are refreshed and every klass is updated. When all
        passes are done, average_weights() is called on every klass.
        """
        for iteration in range(1, iterations + 1):
            error = 0
            for sentence in self.train_sentences:
                path = self.classify(sentence)
                for index, instance in enumerate(sentence.instances):
                    predicted = path[index]
                    instance.predicted_label = predicted
                    if predicted != instance.label:
                        guess = self.tag_klass(predicted)
                        gold = self.tag_klass(instance.label)
                        error += 1
                        guess.adjust(instance.features, '-')
                        gold.adjust(instance.features, '+')
                self.factory.features_update(sentence)
                for klass in self.klasses:
                    klass.update()
            print('Iteration %d: number of errors %d' % (iteration, error))
        for klass in self.klasses:
            klass.average_weights()

    def classify(self, sentence):
        """Return the label path self.viterbi finds for `sentence`."""
        return self.viterbi.viterbi(sentence, self.klasses)

    def test(self):
        """Tag the test sentences and print a confusion table and P/R/F.

        The table rows are indexed by gold label and the columns by guessed
        label, in total_labels order, formatted with '&' separators and a
        LaTeX-style row terminator. One line with precision, recall and F
        (all as percentages) is printed per klass afterwards.
        """
        correct = 0
        wrong = 0
        report_summary = defaultdict(int)

        # The training sentences are re-tagged and their features refreshed
        # before the test sentences are decoded.
        # NOTE(review): presumably features_update depends on the predicted
        # labels -- confirm why this pass is needed.
        for sentence in self.train_sentences:
            path = self.classify(sentence)
            for index, instance in enumerate(sentence.instances):
                instance.predicted_label = path[index]
            self.factory.features_update(sentence)

        for sentence in self.test_sentenses:
            path = self.classify(sentence)
            for index, instance in enumerate(sentence.instances):
                guess = self.tag_klass(path[index])
                gold = self.tag_klass(instance.label)
                report_summary[(gold.tag, guess.tag)] += 1
                if guess.tag != gold.tag:
                    gold.FN += 1
                    guess.FP += 1
                    wrong += 1
                else:
                    gold.TP += 1
                    # Matches on the 'O' tag are not counted as correct.
                    if guess.tag != 'O':
                        correct += 1

        # Each row is assembled as one string and printed with a
        # single-argument print(), which behaves the same under Python 2
        # and Python 3.
        print(' '.join('%s &' % label for label in self.total_labels))
        for label_1 in self.total_labels:
            cells = ['%s' % label_1]
            for label_2 in self.total_labels:
                cells.append('& %s' % report_summary[(label_1, label_2)])
            cells.append("\\\\ \\hline")
            print(' '.join(cells))
        print('%s %s' % (correct, wrong))

        for klass in self.klasses:
            # Only a zero denominator is an expected failure here; anything
            # else is a real error and should surface.
            try:
                P = float(klass.TP) / (klass.TP + klass.FP)
            except ZeroDivisionError:
                P = 0
            try:
                R = float(klass.TP) / (klass.TP + klass.FN)
            except ZeroDivisionError:
                R = 0
            try:
                F = 2 * P * R / (P + R) * 100
            except ZeroDivisionError:
                F = 0
            print("%s & %.2f & %.2f & %.2f" % (klass.tag, P * 100, R * 100, F))
예제 #8
0
파일: testCRF.py 프로젝트: iammrhelo/MLDS
            d[ ph48 ] = char
    return d


# Lookup tables used by the loop below to post-process the decoded sequence:
# each decoded value is converted with int(), looked up in PhoneMapIdxtoPh48,
# and that result is looked up in PhoneMap48to39.
# NOTE(review): the loaders are defined outside this excerpt; the names
# suggest state-index -> 48-phone and 48-phone -> 39-phone maps -- confirm.
PhoneMapIdxtoPh48 = load_liststateto48()
PhoneMap48to39 = load_dict_48to39()
# NOTE(review): the variable is named "39toChr" but it is filled by
# load_dict_48toChr() -- verify which phone set this map is keyed on.
PhoneMap39toChr = load_dict_48toChr()

# Test inputs and their utterance ids; the loop below indexes both by the
# same position (xs[idx], IDs_utter[idx][0]).
xs,IDs_utter = read_test()

# NOTE(review): presumably collects (id, phrase) results; it is only
# initialised in the visible part of the file.
idNphrase=[]
for idx in xrange(0,len(xs)):
    x = xs[idx]
    id_utter = IDs_utter[idx][0]
    y_hat = [0]*len(x)
    V = Viterbi (x , w , y_class , y_hat , 0)
    start = time.clock()
    y_tilde = V.main_Viterbi()
    end = time.clock()
    print "Viterbi time :" , end-start
    print "tilde " , np.dot(w.T , Psi( x , y_tilde ))
    y_temp = [PhoneMap48to39[ PhoneMapIdxtoPh48[int(ph)]] for ph in y_tilde]
    smooth_y = []
    smooth_y1 = []
    smooth_y1.append(y_temp[0])
    for i in xrange(1,len(y_temp)-1):
        if y_temp[i-1] == y_temp[i+1] and \
           y_temp[i]   != y_temp[i-1]:
            smooth_y1.append( y_temp[i-1] )
        elif y_temp[i] != y_temp[i-1] and \
             y_temp[i] != y_temp[i+1]:
예제 #9
0
파일: testCRF.py 프로젝트: ChunHungLiu/MLDS
            d[ph48] = char
    return d


# Lookup tables used by the loop below to post-process the decoded sequence:
# each decoded value is converted with int(), looked up in PhoneMapIdxtoPh48,
# and that result is looked up in PhoneMap48to39.
# NOTE(review): the loaders are defined outside this excerpt; the names
# suggest state-index -> 48-phone and 48-phone -> 39-phone maps -- confirm.
PhoneMapIdxtoPh48 = load_liststateto48()
PhoneMap48to39 = load_dict_48to39()
# NOTE(review): the variable is named "39toChr" but it is filled by
# load_dict_48toChr() -- verify which phone set this map is keyed on.
PhoneMap39toChr = load_dict_48toChr()

# Test inputs and their utterance ids; the loop below indexes both by the
# same position (xs[idx], IDs_utter[idx][0]).
xs, IDs_utter = read_test()

# NOTE(review): presumably collects (id, phrase) results; it is only
# initialised in the visible part of the file.
idNphrase = []
for idx in xrange(0, len(xs)):
    x = xs[idx]
    id_utter = IDs_utter[idx][0]
    y_hat = [0] * len(x)
    V = Viterbi(x, w, y_class, y_hat, 0)
    start = time.clock()
    y_tilde = V.main_Viterbi()
    end = time.clock()
    print "Viterbi time :", end - start
    print "tilde ", np.dot(w.T, Psi(x, y_tilde))
    y_temp = [PhoneMap48to39[PhoneMapIdxtoPh48[int(ph)]] for ph in y_tilde]
    smooth_y = []
    smooth_y1 = []
    smooth_y1.append(y_temp[0])
    for i in xrange(1, len(y_temp) - 1):
        if y_temp[i-1] == y_temp[i+1] and \
           y_temp[i]   != y_temp[i-1]:
            smooth_y1.append(y_temp[i - 1])
        elif y_temp[i] != y_temp[i-1] and \
             y_temp[i] != y_temp[i+1]:
예제 #10
0
        'HOT': {
            'HOT': 0.7,
            'COLD': 0.3
        },
        'COLD': {
            'HOT': 0.4,
            'COLD': 0.6
        }
    }
    emission = {
        'HOT': {
            '1': 0.2,
            '2': 0.4,
            '3': 0.4
        },
        'COLD': {
            '1': 0.5,
            '2': 0.4,
            '3': 0.1
        }
    }
    processor = Viterbi(sequence, states, initial, transition, emission)
    result = processor.process()
    resultString = ''
    for r in result:
        if r == 'HOT':
            resultString += 'H'
        if r == 'COLD':
            resultString += 'C'

    print "The Weather forecast for Observation : ", sequence, "is", resultString
예제 #11
0
    # clean up test file and gold standard data for
    # further processing
    standard = preprocess(standard)
    test_morphemes = preprocess(test_morphemes)

    # extract morphemes from test file and morpheme-tag pairs from gold standard
    test_morphemes, standard = get_morphemes_and_standard(
        test_morphemes, standard)

    # partition morpheme list into sentence strings
    test_sentences = [
        sent.strip() for sent in ' '.join(test_morphemes).split(EOS) if sent
    ]

    # tag sentences!
    v = Viterbi()

    # get output, storing each tag with the morpheme
    # that generated it
    test_output = []
    for sentence in test_sentences:
        s_morphemes = tuple(sentence.split(' '))
        tagged = v.tag(sentence).split(' ')
        test_output.extend(zip(s_morphemes, tagged))

    test_report = []
    errors = 0
    possible = 0

    # iterate through gold standard (a list containing sublists
    # of variable lengths corresponding to lines in the original
예제 #12
0
# Pass the test set through the dataPreProcessor tagging steps; each step
# receives the list of infrequent words found in the testing corpus.
testing_infrequent_words = dataPreProcessor.identify_infrequent_words_in_testing_corpus()
testSet = dataPreProcessor.tag_capital_words(testing_infrequent_words, testSet)
testSet = dataPreProcessor.tag_UNI_ing_words(testing_infrequent_words, testSet)
testSet = dataPreProcessor.tag_numbers(testing_infrequent_words, testSet)

# Create an instance of the HMM and generate its parameters.
# NOTE(review): the model is built from testSet here, although the original
# comment spoke of the training set -- confirm this is intended.
hiddenMarkovModel = HiddenMarkovModel(testSet)
hiddenMarkovModel.calculate_transition_prob_for_POS_tags()
hiddenMarkovModel.calculate_emission_prob()

unified_test_set = [tup for sent in testSet for tup in sent]
test_set_tags = [t for (_, t) in unified_test_set]

viterbi = Viterbi(hiddenMarkovModel)
viterbi_tags = []
# Gold tags of the sentences that were actually tagged. Sentences with 100
# or more tokens are skipped, so comparing the predictions against
# test_set_tags would pair tags from different sentences after the first
# skipped one.
evaluated_tags = []
for test in testSet:
    if len(test) < 100:
        test_observations = [w for (w, _) in test]
        viterbi_tags += viterbi.tag_words(test_observations)
        evaluated_tags += [t for (_, t) in test]

check = [
    v_tag for v_tag, t_tag in zip(viterbi_tags, evaluated_tags)
    if v_tag == t_tag
]
# Accuracy is measured over the tagged tokens; 0.0 when nothing was tagged.
viterbi_accuracy = len(check) / len(evaluated_tags) if evaluated_tags else 0.0

print("Initial tags", len(test_set_tags))
print("Correct tags", len(check))
print("Percentage", viterbi_accuracy * 100)
# Divide testing and training corpus
trainSetSize = 10000
testingSetSize = 500

sentences = brown.tagged_sents(tagset='universal')
trainSet = sentences[0:trainSetSize]
# Continue from where the training set stopped.
testSet = sentences[trainSetSize:trainSetSize + testingSetSize]

# Create an instance of the HMM and pass it the training set to generate
# its parameters.
hiddenMarkovModel = HiddenMarkovModel(trainSet)
hiddenMarkovModel.calculate_transition_prob_for_POS_tags()
hiddenMarkovModel.calculate_emission_prob()

unified_test_set = [tup for sent in testSet for tup in sent]
test_set_tags = [t for (_, t) in unified_test_set]
viterbi = Viterbi(hiddenMarkovModel)
viterbi_tags = []
# Gold tags of the sentences that were actually tagged. Sentences with 100
# or more tokens are skipped, so comparing the predictions against
# test_set_tags would pair tags from different sentences after the first
# skipped one.
evaluated_tags = []

for test in testSet:
    if len(test) < 100:
        test_observations = [w for (w, _) in test]
        viterbi_tags += viterbi.tag_words(test_observations)
        evaluated_tags += [t for (_, t) in test]


check = [v_tag for v_tag, t_tag in zip(viterbi_tags, evaluated_tags) if v_tag == t_tag]
# Accuracy is measured over the tagged tokens; 0.0 when nothing was tagged.
viterbi_accuracy = len(check) / len(evaluated_tags) if evaluated_tags else 0.0

print("Correct tags", len(check))
print("Accuracy", viterbi_accuracy * 100)
print("Final Probability", viterbi.get_final_prob())
print(test_set_tags)
예제 #14
0
# Generate observation points from the simulated states and the per-step
# difference between the two coordinate series.
points = simulation.generatePointsFromStates(simulationStates, 5)
diff = simulation.generateDiffFromPoints(points['x'], points['y'])

# One bar chart each for x, y and their difference.
f, axarr = plt.subplots(3)
for axis, series, colour in (
    (axarr[0], points['x'], 'blue'),
    (axarr[1], points['y'], 'red'),
    (axarr[2], diff, 'green'),
):
    axis.bar(range(0, 100), series, color=colour)
plt.show()

# Pair the coordinates up as (x, y) observations.
observations = [(i, j) for i, j in zip(points['x'], points['y'])]

# Three states, each with probability .9 of staying and .05 of moving to
# either of the other two.
viterbi = Viterbi(
    observations,
    {
        0: {0: .9, 1: .05, 2: .05},
        1: {0: .05, 1: .9, 2: .05},
        2: {0: .05, 1: .05, 2: .9},
    },
    (0, 1, 2),
    2,
)

viterbiStates = viterbi.viterbi()
print(viterbiStates)

# Fraction of positions where the decoded state equals the simulated one.
count = sum(1 for i, j in zip(simulationStates, viterbiStates) if i == j)

print(count / float(len(simulationStates)))
예제 #15
0
		formattedTestFile.write("\n")
	else:
		if line == "+":
			string = "+ "
		elif "+" in line:
			morphemeList = line.split("+")
			string = " ".join(morpheme for morpheme in morphemeList) + " "
		else:
			string = line + " "
		formattedTestFile.write(string)
testFile.close()
formattedTestFile.close()

# Import and run Viterbi algorithm, create list of morphemes in test file
from Viterbi import Viterbi
v = Viterbi()
testMorphemeList = []

# `with` closes both files even if tagging raises part-way through; the
# names stay bound (to closed files) afterwards, as before.
with open("korean-testing-formatted.txt", "r") as formattedTestFile, \
		open("viterbi-out.txt", "w+") as viterbiOutputFile:
	# Iterate the file directly instead of loading every line with readlines().
	for line in formattedTestFile:
		line = line.rstrip()
		# The tagger output is written exactly as returned.
		string = v.tag(line)
		viterbiOutputFile.write(string)
		# Collect the space-separated morphemes of the input line.
		morphemes = line.split(" ")
		testMorphemeList.extend(morphemes)

# Create list of tags in Viterbi test output
예제 #16
0
    # clean up test file and gold standard data for 
    # further processing
    standard = preprocess(standard)
    test_morphemes = preprocess(test_morphemes)

    # extract morphemes from test file and morpheme-tag pairs from gold standard
    test_morphemes, standard = get_morphemes_and_standard(test_morphemes, standard)

    # partition morpheme list into sentence strings
    test_sentences = [sent.strip() for sent in
                      ' '.join(test_morphemes).split(EOS) if sent]


    # tag sentences!
    v = Viterbi()

    # get output, storing each tag with the morpheme
    # that generated it
    test_output = []
    for sentence in test_sentences:
        s_morphemes = tuple(sentence.split(' '))
        tagged = v.tag(sentence).split(' ')
        test_output.extend(zip(s_morphemes, tagged))

    test_report = []
    errors = 0
    possible = 0

    # iterate through gold standard (a list containing sublists
    # of variable lengths corresponding to lines in the original
예제 #17
0
import sys
from Viterbi import Viterbi


if __name__ == '__main__':
    # Usage: <script> INPUT_FILE OUTPUT_FILE
    # Every line of INPUT_FILE is stripped, passed to Viterbi.translate and
    # the result is written as one line of OUTPUT_FILE.
    if len(sys.argv) != 3:
        # sys.exit is used because the bare `exit` name is only injected by
        # the `site` module and is meant for the interactive shell.
        sys.stderr.write('usage: %s INPUT_FILE OUTPUT_FILE\n' % sys.argv[0])
        sys.exit(1)
    inputName = sys.argv[1]
    outputName = sys.argv[2]

    viterbi = Viterbi()

    # The input is read completely before the output is opened, so an
    # unreadable input never truncates an existing output file.
    # NOTE(review): the input uses the platform default encoding while the
    # output is forced to utf-8 -- confirm whether the input should be read
    # as utf-8 as well.
    with open(inputName, 'r') as inputFile:
        sentenceList = inputFile.readlines()

    with open(outputName, 'w', encoding='utf-8') as outputFile:
        for sentence in sentenceList:
            sentence = sentence.strip()
            translation = viterbi.translate(sentence)
            outputFile.write(translation)
            outputFile.write('\n')