Example #1
0
class Tagger:
    '''
    Decode label sequences for a test corpus with a trained Model and write
    "word<TAB>inferred<TAB>gold" lines to an output file.
    '''

    def __init__(self, test_data_file, feature_template_list, model_file, test_output_file, word_sta = None):
        '''
        test_data_file        -- path to the test corpus
        feature_template_list -- feature templates handed to Model
        model_file            -- path of the trained model weights
        test_output_file      -- path where tagged output is written
        word_sta              -- optional prior: word -> tag -> count
        '''
        self.test_data_file = test_data_file
        self.model = Model(feature_template_list, list(ALL_LABEL), model_file)
        self.test_output_file = test_output_file
        # Avoid the shared-mutable-default pitfall: each instance gets a fresh dict.
        self.prior = word_sta if word_sta is not None else {}

    def tag(self):
        '''
        Decode every sentence in the test file with Viterbi, write tagged
        output, and report running and final accuracy (when gold labels exist).
        '''
        li = 0
        correct = [0, 0]  # [labels matching gold, total gold labels]
        # Context manager guarantees the output file is flushed and closed.
        with open(self.test_output_file, 'w') as out_file:
            for (chunk, line) in read_test_data(self.test_data_file):
                observe_data = [w[0] for w in line]
                infer_label = self.model.viterbi(observe_data, self.prior)
                if len(line) > 0 and len(line[0]) > 1:
                    # Gold labels present: accumulate accuracy counts.
                    ideal_label = [w[1] for w in line]
                    for gold, inferred in zip(ideal_label, infer_label):
                        if gold == inferred:
                            correct[0] += 1
                        correct[1] += 1
                else:
                    # Unlabeled input: emit an empty gold column.
                    ideal_label = ['' for l in infer_label]
                for (word, label, labelr) in zip(observe_data, infer_label, ideal_label):
                    print >> out_file, word + '\t' + label + '\t' + labelr
                li += 1
                # Guard the progress line against division by zero when no
                # gold labels have been seen yet (fully unlabeled data).
                acc = float(correct[0]) / correct[1] if correct[1] else 0.0
                sys.stdout.write("tag %d sentence p(f, r) %f  \r" %(li, acc))
                sys.stdout.flush()
                print >> out_file
        print >> sys.stdout
        print >> sys.stdout, "correctness: %f" % (float(correct[0]) / correct[1] if correct[1] else 0.0)
Example #2
0
class Perceptron:
    '''
    Structured perceptron trainer for the sequence-labeling Model.

    word_sta is in the form:
    word -> tag -> count
    '''
    def __init__(self, train_data_file, feature_template_list, model_file, old_model_file = None, word_sta = None):
        '''
        train_data_file       -- path to the training corpus
        feature_template_list -- feature templates handed to Model
        model_file            -- path where the trained model is saved
        old_model_file        -- optional warm-start model to load
        word_sta              -- optional prior: word -> tag -> count
        '''
        self.train_data_file = train_data_file
        self.model = Model(feature_template_list, list(ALL_LABEL), old_model_file)
        self.model_file = model_file
        # Avoid the shared-mutable-default pitfall: each instance gets a fresh dict.
        self.prior = word_sta if word_sta is not None else {}

    def train(self, iteration, keep):
        '''
        Run the perceptron training algorithm.

        iteration -- number of passes over the training data
        keep      -- per-sentence probability of using the sentence in a pass
                     (random subsampling; 1.0 keeps everything)
        '''
        for it in xrange(iteration):
            viterbi_time = 0
            update_time = 0
            on = 0
            ln = 0
            label_len = len(ALL_LABEL) ** 2
            same = [0, 0]  # [sum of per-sentence accuracies, sentences scored]
            print >> sys.stdout, 'perceptron iteration', it + 1
            for (chunk, line) in read_train_data(self.train_data_file):
                ln += 1
                if ln % 1000 == 0:
                    print >> sys.stdout, 'complete %d sentences in %d secs' % (ln, viterbi_time)
                # Random subsampling: drop the sentence with probability 1-keep.
                if random.random() > keep:
                    continue
                observe_data = [w[0] for w in line]
                ideal_label = [w[1] for w in line]
                # An empty sentence would divide by zero below; skip it.
                if not ideal_label:
                    continue
                start = time.clock()
                infer_label = self.model.viterbi(observe_data, self.prior)
                end = time.clock()
                ss = sum(1 for gold, inferred in zip(ideal_label, infer_label) if gold == inferred)
                same[0] += float(ss) / len(ideal_label)
                same[1] += 1
                on += len(observe_data) * label_len
                viterbi_time += end - start
                start = time.clock()
                self.model.update(observe_data, ideal_label, infer_label)
                end = time.clock()
                update_time += end - start
            # Guard: subsampling may have skipped every sentence this pass.
            precision = same[0] / same[1] if same[1] else 0.0
            print >> sys.stdout, 'complete %d iteration in %d secs with precision %f' %(it + 1, viterbi_time, precision)
            # Per-iteration snapshot; 'with' closes the file deterministically.
            with open(self.model_file + '.delta' + str(it + 1), 'w') as delta_file:
                print >> delta_file, self.model
        with open(self.model_file, 'w') as final_file:
            print >> final_file, self.model