class MainApplication(object):
    """Train PMI-based Levenshtein weights from word pairs and print them.

    This is Python 2 code: input lines are byte strings that get decoded
    with ``args.encoding``, and the n-gram weights are walked with
    ``iteritems``.

    ``args`` is an options object; the attributes read by this class are
    ``divisor``, ``encoding``, ``infile``, ``ngram`` and ``savefile``.
    """

    # Class-level fallbacks; __init__ rebinds all three on every instance.
    args = None
    pmi = None
    divisor = 10

    def __init__(self, args):
        """Store *args*, create a PMILevenshtein and copy ``args.divisor``."""
        self.args = args
        self.pmi = PMILevenshtein()
        self.divisor = args.divisor

    def read_input_data(self):
        """Feed every well-formed line of ``args.infile`` to the model.

        A line is used only if, after stripping and decoding, it contains
        exactly one tab; every other line is skipped silently.  Both fields
        are stripped again before being handed to ``pmi.add_pair``.
        """
        encoding = self.args.encoding
        for raw_line in self.args.infile:
            text = raw_line.strip().decode(encoding)
            if text.count('\t') != 1:
                continue
            left, right = text.split('\t')
            self.pmi.add_pair(left.strip(), right.strip())

    def run(self):
        """Read the input, train, rescale the n-gram weights and report them.

        After training, the stored weights are reset and refilled from
        ``pmi.find_ngram_weights(n=args.ngram)``.  A rule is dropped when its
        source side is ``'<eps>'`` or when its weight exceeds
        ``len(source) * divisor``; every remaining weight is divided by
        ``divisor`` and stored with a lower bound of 0.1.

        If ``args.savefile`` is truthy, the output of ``make_xml_param()`` is
        written to it.  Finally every stored weight whose two sides differ is
        printed as ``source<TAB>target<TAB>weight``, sorted ascending by
        weight, UTF-8 encoded and with ``'<eps>'`` removed from the line.
        """
        self.read_input_data()
        self.pmi.train()
        self.pmi.weights.reset_weights()

        floor = 0.1  # smallest weight that is ever stored
        ngram_weights = self.pmi.find_ngram_weights(n=self.args.ngram)
        for rule, weight in ngram_weights.iteritems():
            source, target = rule
            if source == '<eps>' or weight > len(source) * self.divisor:
                continue
            # "* 1.0" forces true division even for integer weights.
            scaled = weight * 1.0 / self.divisor
            self.pmi.weights.set_weight(source, target, max(scaled, floor))

        if self.args.savefile:
            self.args.savefile.write(self.pmi.weights.make_xml_param())

        # NOTE(review): the listing below is printed whether or not a
        # savefile was given -- confirm this matches the intended layout.
        by_weight = sorted(self.pmi.weights.weights.items(), key=itemgetter(1))
        for rule, dist in by_weight:
            if rule[0] == rule[1]:
                continue
            row = "%s\t%s\t%f" % (rule[0], rule[1], dist)
            # Single parenthesised argument: same output as the bare
            # Python 2 print statement.
            print(row.encode("utf-8").replace('<eps>', ''))
def train_and_align(data, eps, log_to, args):
    """Align every pair in *data* and yield one processed alignment per pair.

    Args:
        data: Iterable of ``(source, target)`` pairs.  Each pair is also used
            as a key into the alignment mapping returned by the model, so it
            presumably has to be hashable (e.g. a tuple) -- TODO confirm.
        eps: Value assigned to ``pmi.epsilon`` and forwarded to
            ``process_alignment``.
        log_to: Forwarded to ``pmi.train(log_to=...)``; unused when
            ``args.param`` is set.
        args: Options object; the attributes read are ``use_keep``,
            ``interspersed`` and ``param``.

    Yields:
        For each pair, in input order, a list built from
        ``process_alignment`` applied to the first alignment of that pair.
    """
    keep, interspersed = args.use_keep, args.interspersed

    # The pairs are traversed twice (once to feed the model, once to emit
    # results).  A one-shot iterator would be exhausted after the first pass
    # and the function would silently yield nothing, so materialise it.
    if not isinstance(data, (list, tuple)):
        data = list(data)

    # Train PMI
    pmi = PMILevenshtein()
    pmi.epsilon = eps
    for source, target in data:
        pmi.add_pair(source, target)

    if args.param:
        # Use the weights loaded from the parameter file instead of training.
        pmi.weights.loadParamFromXMLFile(args.param)
        pmi_align = pmi.perform_alignments()
    else:
        pmi.train(log_to=log_to)
        pmi_align = pmi.alignments

    # Output alignments
    for pair in data:
        first_alignment = pmi_align[pair][0]
        yield list(process_alignment(first_alignment, eps,
                                     keep=keep, interspersed=interspersed))
def __init__(self, args):
    """Set up the instance from an options object.

    Stores *args* unchanged, creates a new ``PMILevenshtein`` as ``self.pmi``
    and copies ``args.divisor`` onto the instance.
    """
    # Order is kept as-is: the model is constructed before args.divisor is
    # read.
    self.args = args
    self.pmi = PMILevenshtein()
    self.divisor = args.divisor