class POS_decoder(Model1):
    """Model1 decoder whose alignment scores are boosted when POS tags agree.

    Relies on members that are not defined in this class and are presumably
    provided by ``Model1``: ``null_val``, ``get_german_stem``,
    ``get_english_stem`` and ``get_translation_prob``.
    """

    # Weight of the POS-agreement bonus: a candidate link whose two tags are
    # equal is scored with prior ``1 + TUNE_POS_WEIGHT`` instead of ``1``.
    TUNE_POS_WEIGHT = 2

    def __init__(self, parameter_file):
        """Initialise the base model from ``parameter_file`` and build a tagger."""
        super(POS_decoder, self).__init__(parameter_file)
        self.tagger = PosTagger()

    def get_parallel_instance(self, corpus_line):
        """Split one ``german ||| english`` corpus line into two token lists.

        The line is stripped, split on the literal separator ``' ||| '`` and
        each side is tokenised on single spaces.

        Raises ValueError (from tuple unpacking) if the line does not contain
        exactly one separator.
        """
        german, english = corpus_line.strip().split(' ||| ')
        return (german.split(" "), english.split(" "))

    def get_prior(self, **features):
        """Return ``1 + TUNE_POS_WEIGHT`` if the POS tags are equal, else ``1``.

        Reads the keyword arguments ``tag_german`` and ``tag_english``; a
        missing tag defaults to ``self.null_val``.
        """
        tags_match = (features.get("tag_german", self.null_val)
                      == features.get("tag_english", self.null_val))
        # A bool multiplies as 0/1, so the bonus is added only on a match.
        return 1 + self.TUNE_POS_WEIGHT * tags_match

    def get_alignment(self, german, english):
        """Yield the Model1 alignment for a DE/EN parallel sentence pair.

        For each german word, picks the best-scoring english word (or NULL),
        where the score is the translation probability multiplied by a prior
        that favours alignments preserving the POS tag.

        This is a generator: it yields ``(i, j)`` pairs of german/english
        token indices. German words whose best candidate is NULL yield
        nothing.
        """
        # Tag the surface forms first; stemming/lower-casing happens afterwards.
        gtags = self.tagger.parse(german, "de")
        # Copy before appending so the tagger's own list is never mutated
        # (guards against the tagger reusing or caching its result).
        etags = list(self.tagger.parse(english, "en"))

        german = [self.get_german_stem(word).lower() for word in german]
        english = [self.get_english_stem(word).lower() for word in english]

        # The NULL word is appended as the last english candidate, with
        # null_val standing in for its tag.
        etags.append(self.null_val)
        english.append(self.null_val)
        null_index = len(english) - 1

        for i, g_i in enumerate(german):
            best = -1
            bestscore = 0
            for j, e_j in enumerate(english):
                val = (self.get_prior(tag_german=gtags[i], tag_english=etags[j])
                       * self.get_translation_prob(g_i, e_j))
                # Strict '>' keeps the earliest candidate on ties.
                if best == -1 or val > bestscore:
                    bestscore = val
                    best = j
            # Don't yield anything for a NULL alignment.
            if best < null_index:
                yield (i, best)
def __init__(self, parameter_file):
    """Initialise the parent of POS_decoder with ``parameter_file``, then attach a tagger."""
    # Explicit two-argument super() form, exactly as in the original.
    super(POS_decoder, self).__init__(parameter_file)
    # A fresh PosTagger is created for every instance.
    self.tagger = PosTagger()