Example #1
0
    def tagger(self, filename, iteration):
        print "________________________________________________________________________________________________Perceptron tagger starts"
        for iter_round in range(iteration):
            model_file = 'dumps\\model_' + str(iter_round + 1) + '.dump'
            print 'Reading from file' + '\t' + model_file.split('\\')[1]
            model = multi_class_perceptron(model_file)
            c = DataHelper()

            output = open('dumps\\dev-predicted.col', 'w')
            for sentence in c.read_sentence(filename):
                for token in sentence:
                    feature = token.feature_extracter(model.return_features)
                    score = model.weight_scores(feature)
                    predicted_tag = model.predict(score)
                    pos_tag = model.pos_constructor(token.gold_pos)
                    output.write(
                        '%s\t%s\n' %
                        (token.word, model.return_pos_reverse(predicted_tag)))

                output.write('\n')
            output.close()

            Cgold = DataHelper("dataset\\test.col")
            GoldWordTagList = Cgold.Tokenize(Cgold)

            Cpred = DataHelper("dumps\\dev-predicted.col")
            PredWordTagList = Cpred.Tokenize(Cpred)

            Ctag = DataHelper("dataset\\test.col")
            TagSet = Ctag.tagSet(Ctag)

            eval = Evaluation()
            per_tag = False
            f_measure = eval.Evaluate(per_tag, GoldWordTagList,
                                      PredWordTagList, TagSet)

            print 'F-Measure Micro:' + '\t' + f_measure[0]
            print 'F-Measure Macro:' + '\t' + f_measure[1]
            print
        final_eval = Evaluation()
        f_per_tag = True
        per_tag_table = final_eval.Evaluate(f_per_tag, GoldWordTagList,
                                            PredWordTagList, TagSet)
        print per_tag_table

        print "________________________________________________________________________________________________Perceptron tagger ends"
Example #2
0
    def viterbi_tagger(self, test_file):
        print "________________________________________________________________________________________________viterbi_tagger starts"
        c_3 = DataHelper("dataset\\train.col")

        stream_emission_matrix = gzip.open("dumps\\emission_matrix.dump", 'rb')
        emission_matrix = cPickle.load(stream_emission_matrix)
        stream_emission_matrix.close()

        stream_transition_matrix = gzip.open("dumps\\transition_matrix.dump",
                                             'rb')
        transition_matrix = cPickle.load(stream_transition_matrix)
        stream_transition_matrix.close()

        for x in transition_matrix:
            for p in transition_matrix[x]:
                if transition_matrix[x][p] > 0.2:
                    print p, x, transition_matrix[x][p]

        sentence_count = 0
        word_count = 0

        output = open('dumps\\dev-predicted-viterbi.col', 'w')
        for sentence in c_3.read_sentence(test_file):
            observation = sentence.word_list()
            sentence_count += 1
            word_count += len(observation)
            #print observation
            states = sentence.tag_list()
            #print states
            #for word in observation:
            #   if word in emission_matrix:
            #        states = states + emission_matrix[word].keys()
            #    else:
            #        states = states + ['NN']

            states = list(set(states))
            #states.insert(0, '<S>')

            #start = time.time()
            prediction = self.viterbi_smoothing(observation, states,
                                                emission_matrix,
                                                transition_matrix)
            #end = time.time()
            #print 'Sentence '+str(sentence_count)+' at', end - start

            for i in range(len(prediction[0])):
                output.write('%s\t%s\n' % (prediction[1][i], prediction[0][i]))
            output.write('\n')

        output.close()

        Ctag = DataHelper("dataset\\test.col")
        TagSet = Ctag.tagSet(Ctag)

        self.table_statistics.append([
            str(emission_matrix.__len__()),
            str(len(TagSet)),
            str(word_count),
            str(sentence_count)
        ])
        table = AsciiTable(self.table_statistics)
        print(table.table)
        print "________________________________________________________________________________________________viterbi_tagger ends"

        Cgold = DataHelper("dataset\\test.col")
        GoldWordTagList = Cgold.Tokenize(Cgold)

        Cpred = DataHelper("dumps\\dev-predicted-viterbi.col")
        PredWordTagList = Cpred.Tokenize(Cpred)

        eval = Evaluation()
        per_tag = False
        f_measure = eval.Evaluate(per_tag, GoldWordTagList, PredWordTagList,
                                  TagSet)

        print 'F-Measure Micro:' + '\t' + f_measure[0]
        print 'F-Measure Macro:' + '\t' + f_measure[1]
        print

        final_eval = Evaluation()
        f_per_tag = True
        per_tag_table = final_eval.Evaluate(f_per_tag, GoldWordTagList,
                                            PredWordTagList, TagSet)
        print per_tag_table