def from_folder(cls, path, langs=None):
    """Build an alignment from the text folder at ``path``.

    Opens the folder through ``TextFolder``, fetches its ``'my'`` alignment
    for ``langs`` together with the sentences of each language, and passes
    all three to ``NewAlignment.from_old_alignment``.

    Args:
        path: Location handed straight to ``TextFolder``.
        langs: Language codes to include. Any falsy value (``None`` or an
            empty sequence) selects the default ``['pl', 'cu', 'el']``.

    Returns:
        Whatever ``NewAlignment.from_old_alignment`` returns.
    """
    # Function-scope import: TextFolder is only needed by this constructor.
    from TextFolder import TextFolder

    folder = TextFolder(path)
    langs = langs or ['pl', 'cu', 'el']

    # Fetch the alignment first, then the per-language sentences, in the
    # same order as the language list.
    old_alignment = folder.get_alignment(langs, 'my')
    sentences_per_lang = [folder.get_sentences(code) for code in langs]

    return NewAlignment.from_old_alignment(
        old_alignment, langs, sentences_per_lang)
help='use file with hand-aligned sentence pairs (??-??.hand)') parser.add_argument('--plot', metavar='FILE.png', action="store", default=False, help='plots the matrix of accumulated costs') parser.add_argument('--plot-sim', metavar='FILE.png', action="store", default=False, help='plots the matrix of pair costs') parser.epilog = 'options --hand and --prealign together may cause conflicts, beware!' args = parser.parse_args() print >> sys.stderr print >> sys.stderr, ("=== Aligning %s, %s-%s ===" % (args.folder, args.lang1, args.lang2)) set_languages(args.lang1, args.lang2) tfolder = TextFolder(args.folder) t1 = map(preprocess, tfolder.get_sentences(args.lang1)) t2 = map(preprocess, tfolder.get_sentences(args.lang2)) # reading hand alignment forced_rungs = [] if args.hand: hand_alignment = tfolder.get_alignment([args.lang1, args.lang2], backend='hand') forced_rungs = hand_alignment.as_ladder() print >> sys.stderr, "%d hand-aligned pairs found." % len(forced_rungs) # prealign if args.prealign: pre_alignment = list(find_matches(t1, t2, threshold=0.5, pair_count=100)) forced_rungs.extend(pre_alignment) print >> sys.stderr, "%d sentence pairs matched." % len(pre_alignment) forced_rungs = sorted(set(forced_rungs))
x_longest = x y_longest = y else: M[x][y] = 0 return (longest, len(s1) - x_longest, len(s2) - y_longest) if __name__ == "__main__": from Alignment import Alignment from NewAlignment import NewAlignment langs = ("pl", "cu") # A - tested alignment tf = TextFolder("texts/kanon_izr/") aA = NewAlignment.from_old_alignment( tf.get_alignment(langs, "my"), langs, [tf.get_sentences(lang) for lang in langs] ) # B - correct alignment with open("texts/kanon_izr/everything") as f: aB = NewAlignment.read(f) baseline = NewAlignment() baseline.easy_append(pl=" ".join(tf.get_sentences("pl")), cu=" ".join(tf.get_sentences("cu"))) aB.pretty_print("pl", "cu") print evaluate_alignment(aA, aB) print evaluate_alignment(baseline, aB)