예제 #1
0
class POS_decoder(Model1):
    """Model1 word aligner whose scores are re-weighted by a POS-tag prior.

    Uses ``null_val``, ``get_german_stem``, ``get_english_stem`` and
    ``get_translation_prob``, none of which are defined in this class
    (presumably inherited from Model1 -- confirm against the base class).
    """

    # Bonus applied by get_prior when the German and English POS tags are
    # equal: a matching pair scores 1 + TUNE_POS_WEIGHT, any other pair 1.
    TUNE_POS_WEIGHT = 2

    def __init__(self, parameter_file):
        """Initialise the base model from ``parameter_file`` and create a tagger."""
        super(POS_decoder, self).__init__(parameter_file)
        self.tagger = PosTagger()

    def get_parallel_instance(self, corpus_line):
        """Split one ``german ||| english`` corpus line into two token lists.

        The line is stripped, split on ``' ||| '`` and each side is then
        split on single spaces.

        Returns:
            A ``(german_tokens, english_tokens)`` tuple of lists.

        Raises:
            ValueError: if the line does not split into exactly two sides.
        """
        german, english = corpus_line.strip().split(' ||| ')
        return (german.split(" "), english.split(" "))

    def get_prior(self, **features):
        """Return the POS prior for one candidate word pair.

        Reads the optional keyword features ``tag_german`` and
        ``tag_english`` (each defaulting to ``self.null_val``) and returns
        ``1 + TUNE_POS_WEIGHT`` when they compare equal, otherwise ``1``.
        """
        tags_match = (features.get("tag_german", self.null_val)
                      == features.get("tag_english", self.null_val))
        return 1 + self.TUNE_POS_WEIGHT * tags_match

    def get_alignment(self, german, english):
        """Yield Model1 alignment links for a DE/EN parallel sentence pair.

        For each German word, picks the English candidate (or NULL) with the
        highest ``prior * translation probability``, where the prior favours
        candidates whose POS tag equals the German word's tag.

        This is a generator: it yields ``(german_index, english_index)``
        tuples, and yields nothing for a German word whose best candidate
        is NULL.
        """
        # Tag the original tokens before they are stemmed and lower-cased.
        gtags = self.tagger.parse(german, "de")
        etags = self.tagger.parse(english, "en")
        german = [self.get_german_stem(word).lower() for word in german]
        english = [self.get_english_stem(word).lower() for word in english]

        # NULL is appended as the last English candidate, with a matching
        # placeholder tag so etags stays index-aligned with english.
        etags.append(self.null_val)
        english.append(self.null_val)
        null_index = len(english) - 1

        for i, g_i in enumerate(german):
            best = -1
            bestscore = 0
            for j, e_j in enumerate(english):
                val = (self.get_prior(tag_german=gtags[i], tag_english=etags[j])
                       * self.get_translation_prob(g_i, e_j))
                # The first candidate is always accepted; after that only a
                # strictly higher score wins, so ties keep the earliest one.
                if best == -1 or val > bestscore:
                    bestscore = val
                    best = j
            if best < null_index:
                yield (i, best)  # don't yield anything for NULL alignment
예제 #2
0
 def __init__(self, parameter_file):
     """Initialise the parent class with ``parameter_file`` and attach a tagger."""
     # Parent initialisation runs first, then the tagger is created.
     super(POS_decoder, self).__init__(parameter_file)
     self.tagger = PosTagger()