def from_file(cls, file_path):
    """Build a PairManager from an alignment file.

    The last five characters of ``file_path`` must have the form
    ``xx-yy`` (two word characters, a hyphen, two word characters).
    The two captured codes are used as keys to pull the paired values
    out of every row produced by ``NewAlignment.read``.

    Each row's ``'_f'`` field is split on whitespace:

    * first row: the last token is stored as ``hapax_prob``; the row
      contributes no pair;
    * every later row: the first token is the integer count and the
      last token is the float probability of the pair
      ``(row[lang1], row[lang2])``.

    Args:
        file_path: path of the file to load; also supplies the two
            row keys through its ``xx-yy`` suffix.

    Returns:
        A PairManager whose ``pairs`` maps ``(value1, value2)`` to
        ``(count, prob)`` and whose ``pairs_by_prob`` holds
        ``(prob, value1, value2)`` tuples sorted in descending order.

    Raises:
        ValueError: if ``file_path`` does not end with an ``xx-yy``
            suffix.
    """
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    match = re.match(r'.*(\w\w)-(\w\w)$', file_path)
    if match is None:
        raise ValueError(
            "cannot extract an 'xx-yy' suffix from file path %r" % (file_path,)
        )
    lang1, lang2 = match.groups()

    # NOTE(review): PairManager is instantiated directly rather than via
    # ``cls`` -- kept as in the original; confirm whether subclasses matter.
    pm = PairManager()
    with codecs.open(file_path) as f:
        alignment = NewAlignment.read(f)
        # Rows are consumed while the file is still open so this also works
        # if NewAlignment.read turns out to be lazy (not visible from here).
        for index, row in enumerate(alignment):
            fields = row['_f'].split()
            if index == 0:
                # The first row only carries the hapax probability.
                pm.hapax_prob = float(fields[-1])
                continue
            count = int(fields[0])
            prob = float(fields[-1])
            first_value = row[lang1]
            second_value = row[lang2]
            pm.pairs[first_value, second_value] = (count, prob)
            pm.pairs_by_prob.append((prob, first_value, second_value))

    # Highest probability first.
    pm.pairs_by_prob.sort(reverse=True)
    return pm
x_longest = x y_longest = y else: M[x][y] = 0 return (longest, len(s1) - x_longest, len(s2) - y_longest) if __name__ == "__main__": from Alignment import Alignment from NewAlignment import NewAlignment langs = ("pl", "cu") # A - tested alignment tf = TextFolder("texts/kanon_izr/") aA = NewAlignment.from_old_alignment( tf.get_alignment(langs, "my"), langs, [tf.get_sentences(lang) for lang in langs] ) # B - correct alignment with open("texts/kanon_izr/everything") as f: aB = NewAlignment.read(f) baseline = NewAlignment() baseline.easy_append(pl=" ".join(tf.get_sentences("pl")), cu=" ".join(tf.get_sentences("cu"))) aB.pretty_print("pl", "cu") print evaluate_alignment(aA, aB) print evaluate_alignment(baseline, aB)