Esempio n. 1
0
 def from_folder(cls, path, langs=None):
     """Build a NewAlignment from the text folder at `path`.

     path  -- handed unchanged to the TextFolder constructor.
     langs -- language codes to load; any falsy value (None, empty
              sequence) selects the default ['pl', 'cu', 'el'].

     Returns the result of NewAlignment.from_old_alignment for the
     folder's 'my' alignment together with one get_sentences() result
     per language, in the same order as `langs`.
     """
     # Function-scope import, kept as in the original
     # (presumably avoids a circular import -- TODO confirm).
     from TextFolder import TextFolder

     folder = TextFolder(path)
     languages = langs if langs else ['pl', 'cu', 'el']

     old_alignment = folder.get_alignment(languages, 'my')
     sentences = []
     for code in languages:
         sentences.append(folder.get_sentences(code))

     # NOTE(review): `cls` is not used; the result is always built through
     # NewAlignment, so a subclass calling this gets NewAlignment's constructor.
     return NewAlignment.from_old_alignment(old_alignment, languages, sentences)
Esempio n. 2
0
    parser.add_argument('--hand', action="store_true", default=None,
                        help='use file with hand-aligned sentence pairs (??-??.hand)')
    parser.add_argument('--plot', metavar='FILE.png', action="store", default=False,
                        help='plots the matrix of accumulated costs')
    parser.add_argument('--plot-sim', metavar='FILE.png', action="store", default=False,
                        help='plots the matrix of pair costs')
    parser.epilog = 'options --hand and --prealign together may cause conflicts, beware!'

    args = parser.parse_args()

    print >> sys.stderr
    print >> sys.stderr, ("=== Aligning %s, %s-%s ===" %
                          (args.folder, args.lang1, args.lang2))

    set_languages(args.lang1, args.lang2)
    tfolder = TextFolder(args.folder)
    t1 = map(preprocess, tfolder.get_sentences(args.lang1))
    t2 = map(preprocess, tfolder.get_sentences(args.lang2))

    # reading hand alignment
    forced_rungs = []
    if args.hand:
        hand_alignment = tfolder.get_alignment([args.lang1, args.lang2],
                                               backend='hand')
        forced_rungs = hand_alignment.as_ladder()
        print >> sys.stderr, "%d hand-aligned pairs found." % len(forced_rungs)
    # prealign
    if args.prealign:
        pre_alignment = list(find_matches(t1, t2, threshold=0.5, pair_count=100))
        forced_rungs.extend(pre_alignment)
        print >> sys.stderr, "%d sentence pairs matched." % len(pre_alignment)
Esempio n. 3
0
                    longest = M[x][y]
                    x_longest = x
                    y_longest = y
            else:
                M[x][y] = 0
    return (longest, len(s1) - x_longest, len(s2) - y_longest)


if __name__ == "__main__":
    # Ad-hoc evaluation run: score the stored "my" alignment of one text
    # folder, and a one-entry baseline, against a reference alignment file.
    from Alignment import Alignment
    from NewAlignment import NewAlignment

    langs = ("pl", "cu")
    tf = TextFolder("texts/kanon_izr/")

    # A - the alignment under test, converted from the old representation.
    old_alignment = tf.get_alignment(langs, "my")
    sentence_lists = []
    for lang in langs:
        sentence_lists.append(tf.get_sentences(lang))
    aA = NewAlignment.from_old_alignment(old_alignment, langs, sentence_lists)

    # B - the correct (reference) alignment, read from a file in the folder.
    with open("texts/kanon_izr/everything") as f:
        aB = NewAlignment.read(f)

    # Baseline: a single entry holding every sentence of each language,
    # joined with spaces.
    all_pl = " ".join(tf.get_sentences("pl"))
    all_cu = " ".join(tf.get_sentences("cu"))
    baseline = NewAlignment()
    baseline.easy_append(pl=all_pl, cu=all_cu)

    aB.pretty_print("pl", "cu")

    # A single parenthesised argument prints the same value under the
    # Python 2 print statement as the unparenthesised form.
    print(evaluate_alignment(aA, aB))
    print(evaluate_alignment(baseline, aB))