コード例 #1
0
ファイル: train_pmi.py プロジェクト: mbollmann/levenshtein
class MainApplication(object):
    """Train a ``PMILevenshtein`` model on tab-separated pairs and report weights.

    The object is configured from an ``args`` namespace; the attributes this
    class reads from it are ``divisor``, ``encoding``, ``infile``, ``ngram``
    and ``savefile``.
    """

    # Class-level defaults; every one of them is overwritten in __init__.
    args = None
    pmi = None
    divisor = 10

    def __init__(self, args):
        """Store the options and create a fresh ``PMILevenshtein`` instance."""
        self.args = args
        self.pmi = PMILevenshtein()
        self.divisor = args.divisor

    def read_input_data(self):
        """Register every usable line of ``args.infile`` as a training pair.

        Each line is stripped, decoded with ``args.encoding`` and split on
        tabs. Lines that do not contain exactly one tab are skipped; for the
        rest, both halves are stripped and handed to ``pmi.add_pair``.
        """
        encoding = self.args.encoding
        for raw_line in self.args.infile:
            fields = raw_line.strip().decode(encoding).split('\t')
            # Exactly one tab <=> exactly two fields.
            if len(fields) != 2:
                continue
            left, right = fields
            self.pmi.add_pair(left.strip(), right.strip())

    def run(self):
        """Read the data, train, rebuild the weight table and print it.

        After training, the weights are reset and repopulated from
        ``pmi.find_ngram_weights``. A rule is skipped when its source side is
        ``'<eps>'`` or when its weight exceeds ``len(source) * divisor``;
        otherwise ``weight / divisor`` is stored, but never less than 0.1.
        If ``args.savefile`` is set, the result of ``make_xml_param()`` is
        written to it. Finally every stored rule whose two sides differ is
        printed as a tab-separated line, ordered by ascending weight, with
        any ``'<eps>'`` marker removed from the output text.
        """
        self.read_input_data()
        self.pmi.train()

        self.pmi.weights.reset_weights()
        floor = 0.1
        ngram_weights = self.pmi.find_ngram_weights(n=self.args.ngram)
        for (source, target), weight in ngram_weights.iteritems():
            if source == '<eps>' or weight > len(source) * self.divisor:
                continue
            # "* 1.0" forces float division under Python 2.
            scaled = weight * 1.0 / self.divisor
            self.pmi.weights.set_weight(source, target, max(scaled, floor))

        savefile = self.args.savefile
        if savefile:
            savefile.write(self.pmi.weights.make_xml_param())

        ranked = sorted(self.pmi.weights.weights.items(), key=itemgetter(1))
        for (rule, dist) in ranked:
            if rule[0] == rule[1]:
                continue
            row = "%s\t%s\t%f" % (rule[0], rule[1], dist)
            # Single parenthesised argument: prints the same under Python 2.
            print(row.encode("utf-8").replace('<eps>', ''))
コード例 #2
0
ファイル: conv_norm.py プロジェクト: mbollmann/levenshtein
def train_and_align(data, eps, log_to, args):
    """Yield one processed alignment per (source, target) pair in ``data``.

    A ``PMILevenshtein`` instance is given ``eps`` as its ``epsilon`` and fed
    every pair. If ``args.param`` is set, weights are loaded from that file
    and alignments come from ``perform_alignments()``; otherwise the model is
    trained (logging to ``log_to``) and ``pmi.alignments`` is used.

    For each pair, the first alignment found under that pair's key is passed
    through ``process_alignment`` and yielded as a list.

    NOTE(review): ``data`` is iterated twice, so a one-shot iterator would
    produce no output in the second pass -- confirm callers pass a sequence.
    """
    keep_flag = args.use_keep
    intersperse_flag = args.interspersed

    # Build the model and register all pairs.
    model = PMILevenshtein()
    model.epsilon = eps
    for (src, tgt) in data:
        model.add_pair(src, tgt)

    # Obtain alignments either by training or from preloaded parameters.
    if not args.param:
        model.train(log_to=log_to)
        aligned = model.alignments
    else:
        model.weights.loadParamFromXMLFile(args.param)
        aligned = model.perform_alignments()

    # Emit the first alignment of every pair, in input order.
    for pair in data:
        first_alignment = aligned[pair][0]
        processed = process_alignment(first_alignment, eps,
                                      keep=keep_flag,
                                      interspersed=intersperse_flag)
        yield list(processed)
コード例 #3
0
ファイル: train_pmi.py プロジェクト: mbollmann/levenshtein
 def __init__(self, args):
     """Store ``args`` and set up the state this object works with.

     Keeps the ``args`` object, creates a new ``PMILevenshtein`` instance
     as ``self.pmi``, and copies ``args.divisor`` to ``self.divisor``.
     """
     self.args = args
     self.pmi = PMILevenshtein()
     # Copied out of args so it can be read directly from the instance.
     self.divisor = args.divisor