Esempio n. 1
0
 def from_folder(cls, path, langs=None):
     """Build a NewAlignment from the text folder located at `path`.

     Opens the folder with TextFolder, fetches its 'my' alignment for the
     requested languages together with the sentence list of every
     language, and converts them via NewAlignment.from_old_alignment.

     path  -- location handed to TextFolder.
     langs -- sequence of language codes; any falsy value (None or an
              empty sequence) selects the default ['pl', 'cu', 'el'].

     Returns whatever NewAlignment.from_old_alignment returns. Note that
     `cls` itself is not consulted: the result is always produced through
     NewAlignment.
     """
     # Imported locally, as in the original code.
     from TextFolder import TextFolder

     if not langs:
         langs = ['pl', 'cu', 'el']

     folder = TextFolder(path)
     old_alignment = folder.get_alignment(langs, 'my')

     # One sentence sequence per language, in the same order as `langs`.
     sentences_by_lang = []
     for lang in langs:
         sentences_by_lang.append(folder.get_sentences(lang))

     return NewAlignment.from_old_alignment(old_alignment, langs,
                                            sentences_by_lang)
Esempio n. 2
0
    args = parser.parse_args()

    print >> sys.stderr
    print >> sys.stderr, ("=== Aligning %s, %s-%s ===" %
                          (args.folder, args.lang1, args.lang2))

    set_languages(args.lang1, args.lang2)
    tfolder = TextFolder(args.folder)
    t1 = map(preprocess, tfolder.get_sentences(args.lang1))
    t2 = map(preprocess, tfolder.get_sentences(args.lang2))

    # reading hand alignment
    forced_rungs = []
    if args.hand:
        hand_alignment = tfolder.get_alignment([args.lang1, args.lang2],
                                               backend='hand')
        forced_rungs = hand_alignment.as_ladder()
        print >> sys.stderr, "%d hand-aligned pairs found." % len(forced_rungs)
    # prealign
    if args.prealign:
        pre_alignment = list(find_matches(t1, t2, threshold=0.5, pair_count=100))
        forced_rungs.extend(pre_alignment)
        print >> sys.stderr, "%d sentence pairs matched." % len(pre_alignment)
    forced_rungs = sorted(set(forced_rungs))

    try:
        a = None
        a = make_composed_alignment(t1, t2, forced_rungs)
        a = Alignment(a)
    finally:
        output_filename = '%s/%s-%s.my' % (args.folder, args.lang1, args.lang2)
Esempio n. 3
0
                    x_longest = x
                    y_longest = y
            else:
                M[x][y] = 0
    return (longest, len(s1) - x_longest, len(s2) - y_longest)


if __name__ == "__main__":
    # Ad-hoc evaluation run: score the stored "my" pl/cu alignment of one
    # text, and a trivial baseline, against the alignment read from the
    # "everything" file in the same folder.
    from Alignment import Alignment
    from NewAlignment import NewAlignment

    langs = ("pl", "cu")

    # A - tested alignment
    tf = TextFolder("texts/kanon_izr/")
    aA = NewAlignment.from_old_alignment(
        tf.get_alignment(langs, "my"), langs, [tf.get_sentences(lang) for lang in langs]
    )

    # B - correct alignment
    with open("texts/kanon_izr/everything") as f:
        aB = NewAlignment.read(f)

    # Baseline: one easy_append call holding the whole text of each language,
    # with the sentences joined by single spaces.
    baseline = NewAlignment()
    baseline.easy_append(pl=" ".join(tf.get_sentences("pl")), cu=" ".join(tf.get_sentences("cu")))

    aB.pretty_print("pl", "cu")

    # Parenthesised single-argument print produces the same output under
    # Python 2 and is also valid syntax under Python 3.
    print(evaluate_alignment(aA, aB))
    print(evaluate_alignment(baseline, aB))